diff --git a/swh/indexer/metadata_dictionary.py b/swh/indexer/metadata_dictionary.py deleted file mode 100644 index f1122c3..0000000 --- a/swh/indexer/metadata_dictionary.py +++ /dev/null @@ -1,731 +0,0 @@ -# Copyright (C) 2017 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -import os -import re -import abc -import ast -import json -import logging -import itertools -import collections -import email.parser -import email.policy -import xml.parsers.expat - -import click -import xmltodict - -from swh.indexer.codemeta import CROSSWALK_TABLE, SCHEMA_URI, CODEMETA_TERMS -from swh.indexer.codemeta import compact, expand - - -MAPPINGS = {} - - -def register_mapping(cls): - MAPPINGS[cls.__name__] = cls - return cls - - -def list_terms(): - """Returns a dictionary with all supported CodeMeta terms as keys, - and the mappings that support each of them as values.""" - d = collections.defaultdict(set) - for mapping in MAPPINGS.values(): - for term in mapping.supported_terms(): - d[term].add(mapping) - return d - - -def merge_values(v1, v2): - """If v1 and v2 are of the form `{"@list": l1}` and `{"@list": l2}`, - returns `{"@list": l1 + l2}`. - Otherwise, make them lists (if they are not already) and concatenate - them. 
- - >>> merge_values('a', 'b') - ['a', 'b'] - >>> merge_values(['a', 'b'], 'c') - ['a', 'b', 'c'] - >>> merge_values({'@list': ['a', 'b']}, {'@list': ['c']}) - {'@list': ['a', 'b', 'c']} - """ - if v1 is None: - return v2 - elif v2 is None: - return v1 - elif isinstance(v1, dict) and set(v1) == {'@list'}: - assert isinstance(v1['@list'], list) - if isinstance(v2, dict) and set(v2) == {'@list'}: - assert isinstance(v2['@list'], list) - return {'@list': v1['@list'] + v2['@list']} - else: - raise ValueError('Cannot merge %r and %r' % (v1, v2)) - else: - if isinstance(v2, dict) and '@list' in v2: - raise ValueError('Cannot merge %r and %r' % (v1, v2)) - if not isinstance(v1, list): - v1 = [v1] - if not isinstance(v2, list): - v2 = [v2] - return v1 + v2 - - -class BaseMapping(metaclass=abc.ABCMeta): - """Base class for mappings to inherit from - - To implement a new mapping: - - - inherit this class - - override translate function - """ - def __init__(self, log_suffix=''): - self.log_suffix = log_suffix - self.log = logging.getLogger('%s.%s' % ( - self.__class__.__module__, - self.__class__.__name__)) - - @property - @abc.abstractmethod - def name(self): - """A name of this mapping, used as an identifier in the - indexer storage.""" - pass - - @classmethod - @abc.abstractmethod - def detect_metadata_files(cls, files): - """ - Detects files potentially containing metadata - - Args: - file_entries (list): list of files - - Returns: - list: list of sha1 (possibly empty) - """ - pass - - @abc.abstractmethod - def translate(self, file_content): - pass - - def normalize_translation(self, metadata): - return compact(metadata) - - -class SingleFileMapping(BaseMapping): - """Base class for all mappings that use a single file as input.""" - - @property - @abc.abstractmethod - def filename(self): - """The .json file to extract metadata from.""" - pass - - @classmethod - def detect_metadata_files(cls, file_entries): - for entry in file_entries: - if entry['name'] == cls.filename: - 
return [entry['sha1']] - return [] - - -class DictMapping(BaseMapping): - """Base class for mappings that take as input a file that is mostly - a key-value store (eg. a shallow JSON dict).""" - - string_fields = [] - '''List of fields that are simple strings, and don't need any - normalization.''' - - @property - @abc.abstractmethod - def mapping(self): - """A translation dict to map dict keys into a canonical name.""" - pass - - @staticmethod - def _normalize_method_name(name): - return name.replace('-', '_') - - @classmethod - def supported_terms(cls): - return { - term for (key, term) in cls.mapping.items() - if key in cls.string_fields - or hasattr(cls, 'translate_' + cls._normalize_method_name(key)) - or hasattr(cls, 'normalize_' + cls._normalize_method_name(key))} - - def _translate_dict(self, content_dict, *, normalize=True): - """ - Translates content by parsing content from a dict object - and translating with the appropriate mapping - - Args: - content_dict (dict): content dict to translate - - Returns: - dict: translated metadata in json-friendly form needed for - the indexer - - """ - translated_metadata = {'@type': SCHEMA_URI + 'SoftwareSourceCode'} - for k, v in content_dict.items(): - # First, check if there is a specific translation - # method for this key - translation_method = getattr( - self, 'translate_' + self._normalize_method_name(k), None) - if translation_method: - translation_method(translated_metadata, v) - elif k in self.mapping: - # if there is no method, but the key is known from the - # crosswalk table - codemeta_key = self.mapping[k] - - # if there is a normalization method, use it on the value - normalization_method = getattr( - self, 'normalize_' + self._normalize_method_name(k), None) - if normalization_method: - v = normalization_method(v) - elif k in self.string_fields and isinstance(v, str): - pass - elif k in self.string_fields and isinstance(v, list): - v = [x for x in v if isinstance(x, str)] - else: - continue - - # set the 
translation metadata with the normalized value - if codemeta_key in translated_metadata: - translated_metadata[codemeta_key] = merge_values( - translated_metadata[codemeta_key], v) - else: - translated_metadata[codemeta_key] = v - if normalize: - return self.normalize_translation(translated_metadata) - else: - return translated_metadata - - -class JsonMapping(DictMapping, SingleFileMapping): - """Base class for all mappings that use a JSON file as input.""" - - def translate(self, raw_content): - """ - Translates content by parsing content from a bytestring containing - json data and translating with the appropriate mapping - - Args: - raw_content (bytes): raw content to translate - - Returns: - dict: translated metadata in json-friendly form needed for - the indexer - - """ - try: - raw_content = raw_content.decode() - except UnicodeDecodeError: - self.log.warning('Error unidecoding from %s', self.log_suffix) - return - try: - content_dict = json.loads(raw_content) - except json.JSONDecodeError: - self.log.warning('Error unjsoning from %s', self.log_suffix) - return - if isinstance(content_dict, dict): - return self._translate_dict(content_dict) - - -@register_mapping -class NpmMapping(JsonMapping): - """ - dedicated class for NPM (package.json) mapping and translation - """ - name = 'npm' - mapping = CROSSWALK_TABLE['NodeJS'] - filename = b'package.json' - string_fields = ['name', 'version', 'homepage', 'description', 'email'] - - _schema_shortcuts = { - 'github': 'git+https://github.com/%s.git', - 'gist': 'git+https://gist.github.com/%s.git', - 'gitlab': 'git+https://gitlab.com/%s.git', - # Bitbucket supports both hg and git, and the shortcut does not - # tell which one to use. - # 'bitbucket': 'https://bitbucket.org/', - } - - def normalize_repository(self, d): - """https://docs.npmjs.com/files/package.json#repository - - >>> NpmMapping().normalize_repository({ - ... 'type': 'git', - ... 'url': 'https://example.org/foo.git' - ... 
}) - {'@id': 'git+https://example.org/foo.git'} - >>> NpmMapping().normalize_repository( - ... 'gitlab:foo/bar') - {'@id': 'git+https://gitlab.com/foo/bar.git'} - >>> NpmMapping().normalize_repository( - ... 'foo/bar') - {'@id': 'git+https://github.com/foo/bar.git'} - """ - if isinstance(d, dict) and isinstance(d.get('type'), str) \ - and isinstance(d.get('url'), str): - url = '{type}+{url}'.format(**d) - elif isinstance(d, str): - if '://' in d: - url = d - elif ':' in d: - (schema, rest) = d.split(':', 1) - if schema in self._schema_shortcuts: - url = self._schema_shortcuts[schema] % rest - else: - return None - else: - url = self._schema_shortcuts['github'] % d - - else: - return None - - return {'@id': url} - - def normalize_bugs(self, d): - """https://docs.npmjs.com/files/package.json#bugs - - >>> NpmMapping().normalize_bugs({ - ... 'url': 'https://example.org/bugs/', - ... 'email': 'bugs@example.org' - ... }) - {'@id': 'https://example.org/bugs/'} - >>> NpmMapping().normalize_bugs( - ... 'https://example.org/bugs/') - {'@id': 'https://example.org/bugs/'} - """ - if isinstance(d, dict) and isinstance(d.get('url'), str): - return {'@id': d['url']} - elif isinstance(d, str): - return {'@id': d} - else: - return None - - _parse_author = re.compile(r'^ *' - r'(?P.*?)' - r'( +<(?P.*)>)?' - r'( +\((?P.*)\))?' - r' *$') - - def normalize_author(self, d): - """https://docs.npmjs.com/files/package.json#people-fields-author-contributors' - - >>> from pprint import pprint - >>> pprint(NpmMapping().normalize_author({ - ... 'name': 'John Doe', - ... 'email': 'john.doe@example.org', - ... 'url': 'https://example.org/~john.doe', - ... })) - {'@list': [{'@type': 'http://schema.org/Person', - 'http://schema.org/email': 'john.doe@example.org', - 'http://schema.org/name': 'John Doe', - 'http://schema.org/url': {'@id': 'https://example.org/~john.doe'}}]} - >>> pprint(NpmMapping().normalize_author( - ... 'John Doe (https://example.org/~john.doe)' - ... 
)) - {'@list': [{'@type': 'http://schema.org/Person', - 'http://schema.org/email': 'john.doe@example.org', - 'http://schema.org/name': 'John Doe', - 'http://schema.org/url': {'@id': 'https://example.org/~john.doe'}}]} - """ # noqa - author = {'@type': SCHEMA_URI+'Person'} - if isinstance(d, dict): - name = d.get('name', None) - email = d.get('email', None) - url = d.get('url', None) - elif isinstance(d, str): - match = self._parse_author.match(d) - name = match.group('name') - email = match.group('email') - url = match.group('url') - else: - return None - if name and isinstance(name, str): - author[SCHEMA_URI+'name'] = name - if email and isinstance(email, str): - author[SCHEMA_URI+'email'] = email - if url and isinstance(url, str): - author[SCHEMA_URI+'url'] = {'@id': url} - return {"@list": [author]} - - def normalize_license(self, s): - """https://docs.npmjs.com/files/package.json#license - - >>> NpmMapping().normalize_license('MIT') - {'@id': 'https://spdx.org/licenses/MIT'} - """ - if isinstance(s, str): - return {"@id": "https://spdx.org/licenses/" + s} - - def normalize_homepage(self, s): - """https://docs.npmjs.com/files/package.json#homepage - - >>> NpmMapping().normalize_homepage('https://example.org/~john.doe') - {'@id': 'https://example.org/~john.doe'} - """ - if isinstance(s, str): - return {"@id": s} - - def normalize_keywords(self, l): - """https://docs.npmjs.com/files/package.json#homepage - - >>> NpmMapping().normalize_keywords(['foo', 'bar']) - ['foo', 'bar'] - """ - if isinstance(l, list): - return [x for x in l if isinstance(x, str)] - - -@register_mapping -class CodemetaMapping(SingleFileMapping): - """ - dedicated class for CodeMeta (codemeta.json) mapping and translation - """ - name = 'codemeta' - filename = b'codemeta.json' - string_fields = None - - @classmethod - def supported_terms(cls): - return [term for term in CODEMETA_TERMS if not term.startswith('@')] - - def translate(self, content): - try: - return 
self.normalize_translation(expand( - json.loads(content.decode()))) - except Exception: - return None - - -@register_mapping -class MavenMapping(DictMapping, SingleFileMapping): - """ - dedicated class for Maven (pom.xml) mapping and translation - """ - name = 'maven' - filename = b'pom.xml' - mapping = CROSSWALK_TABLE['Java (Maven)'] - string_fields = ['name', 'version', 'description', 'email'] - - def translate(self, content): - try: - d = xmltodict.parse(content).get('project') or {} - except xml.parsers.expat.ExpatError: - self.log.warning('Error parsing XML from %s', self.log_suffix) - return None - except UnicodeDecodeError: - self.log.warning('Error unidecoding XML from %s', self.log_suffix) - return None - except (LookupError, ValueError): - # unknown encoding or multi-byte encoding - self.log.warning('Error detecting XML encoding from %s', - self.log_suffix) - return None - metadata = self._translate_dict(d, normalize=False) - metadata[SCHEMA_URI+'codeRepository'] = self.parse_repositories(d) - metadata[SCHEMA_URI+'license'] = self.parse_licenses(d) - return self.normalize_translation(metadata) - - _default_repository = {'url': 'https://repo.maven.apache.org/maven2/'} - - def parse_repositories(self, d): - """https://maven.apache.org/pom.html#Repositories - - >>> import xmltodict - >>> from pprint import pprint - >>> d = xmltodict.parse(''' - ... - ... - ... codehausSnapshots - ... Codehaus Snapshots - ... http://snapshots.maven.codehaus.org/maven2 - ... default - ... - ... - ... 
''') - >>> MavenMapping().parse_repositories(d) - """ - repositories = d.get('repositories') - if not repositories: - results = [self.parse_repository(d, self._default_repository)] - elif isinstance(repositories, dict): - repositories = repositories.get('repository') or [] - if not isinstance(repositories, list): - repositories = [repositories] - results = [self.parse_repository(d, repo) - for repo in repositories] - else: - results = [] - return [res for res in results if res] or None - - def parse_repository(self, d, repo): - if not isinstance(repo, dict): - return - if repo.get('layout', 'default') != 'default': - return # TODO ? - url = repo.get('url') - group_id = d.get('groupId') - artifact_id = d.get('artifactId') - if (isinstance(url, str) and isinstance(group_id, str) - and isinstance(artifact_id, str)): - repo = os.path.join(url, *group_id.split('.'), artifact_id) - return {"@id": repo} - - def normalize_groupId(self, id_): - """https://maven.apache.org/pom.html#Maven_Coordinates - - >>> MavenMapping().normalize_groupId('org.example') - {'@id': 'org.example'} - """ - if isinstance(id_, str): - return {"@id": id_} - - def parse_licenses(self, d): - """https://maven.apache.org/pom.html#Licenses - - >>> import xmltodict - >>> import json - >>> d = xmltodict.parse(''' - ... - ... - ... Apache License, Version 2.0 - ... https://www.apache.org/licenses/LICENSE-2.0.txt - ... - ... - ... ''') - >>> print(json.dumps(d, indent=4)) - { - "licenses": { - "license": { - "name": "Apache License, Version 2.0", - "url": "https://www.apache.org/licenses/LICENSE-2.0.txt" - } - } - } - >>> MavenMapping().parse_licenses(d) - [{'@id': 'https://www.apache.org/licenses/LICENSE-2.0.txt'}] - - or, if there are more than one license: - - >>> import xmltodict - >>> from pprint import pprint - >>> d = xmltodict.parse(''' - ... - ... - ... Apache License, Version 2.0 - ... https://www.apache.org/licenses/LICENSE-2.0.txt - ... - ... - ... MIT License - ... 
https://opensource.org/licenses/MIT - ... - ... - ... ''') - >>> pprint(MavenMapping().parse_licenses(d)) - [{'@id': 'https://www.apache.org/licenses/LICENSE-2.0.txt'}, - {'@id': 'https://opensource.org/licenses/MIT'}] - """ - - licenses = d.get('licenses') - if not isinstance(licenses, dict): - return - licenses = licenses.get('license') - if isinstance(licenses, dict): - licenses = [licenses] - elif not isinstance(licenses, list): - return - return [{"@id": license['url']} - for license in licenses - if isinstance(license, dict) - and isinstance(license.get('url'), str)] or None - - -_normalize_pkginfo_key = str.lower - - -class LinebreakPreservingEmailPolicy(email.policy.EmailPolicy): - def header_fetch_parse(self, name, value): - if hasattr(value, 'name'): - return value - value = value.replace('\n ', '\n') - return self.header_factory(name, value) - - -@register_mapping -class PythonPkginfoMapping(DictMapping, SingleFileMapping): - """Dedicated class for Python's PKG-INFO mapping and translation. 
- - https://www.python.org/dev/peps/pep-0314/""" - name = 'pkg-info' - filename = b'PKG-INFO' - mapping = {_normalize_pkginfo_key(k): v - for (k, v) in CROSSWALK_TABLE['Python PKG-INFO'].items()} - string_fields = ['name', 'version', 'description', 'summary', - 'author', 'author-email'] - - _parser = email.parser.BytesHeaderParser( - policy=LinebreakPreservingEmailPolicy()) - - def translate(self, content): - msg = self._parser.parsebytes(content) - d = {} - for (key, value) in msg.items(): - key = _normalize_pkginfo_key(key) - if value != 'UNKNOWN': - d.setdefault(key, []).append(value) - metadata = self._translate_dict(d, normalize=False) - if SCHEMA_URI+'author' in metadata or SCHEMA_URI+'email' in metadata: - metadata[SCHEMA_URI+'author'] = { - '@list': [{ - '@type': SCHEMA_URI+'Person', - SCHEMA_URI+'name': - metadata.pop(SCHEMA_URI+'author', [None])[0], - SCHEMA_URI+'email': - metadata.pop(SCHEMA_URI+'email', [None])[0], - }] - } - return self.normalize_translation(metadata) - - def normalize_home_page(self, urls): - return [{'@id': url} for url in urls] - - def normalize_keywords(self, keywords): - return list(itertools.chain.from_iterable( - s.split(' ') for s in keywords)) - - def normalize_license(self, licenses): - return [{'@id': license} for license in licenses] - - -@register_mapping -class GemspecMapping(DictMapping): - name = 'gemspec' - mapping = CROSSWALK_TABLE['Ruby Gem'] - string_fields = ['name', 'version', 'description', 'summary', 'email'] - - _re_spec_new = re.compile(r'.*Gem::Specification.new +(do|\{) +\|.*\|.*') - _re_spec_entry = re.compile(r'\s*\w+\.(?P\w+)\s*=\s*(?P.*)') - - @classmethod - def detect_metadata_files(cls, file_entries): - for entry in file_entries: - if entry['name'].endswith(b'.gemspec'): - return [entry['sha1']] - return [] - - def translate(self, raw_content): - try: - raw_content = raw_content.decode() - except UnicodeDecodeError: - self.log.warning('Error unidecoding from %s', self.log_suffix) - return - - # Skip 
lines before 'Gem::Specification.new' - lines = itertools.dropwhile( - lambda x: not self._re_spec_new.match(x), - raw_content.split('\n')) - - try: - next(lines) # Consume 'Gem::Specification.new' - except StopIteration: - self.log.warning('Could not find Gem::Specification in %s', - self.log_suffix) - return - - content_dict = {} - for line in lines: - match = self._re_spec_entry.match(line) - if match: - value = self.eval_ruby_expression(match.group('expr')) - if value: - content_dict[match.group('key')] = value - return self._translate_dict(content_dict) - - def eval_ruby_expression(self, expr): - """Very simple evaluator of Ruby expressions. - - >>> GemspecMapping().eval_ruby_expression('"Foo bar"') - 'Foo bar' - >>> GemspecMapping().eval_ruby_expression("'Foo bar'") - 'Foo bar' - >>> GemspecMapping().eval_ruby_expression("['Foo', 'bar']") - ['Foo', 'bar'] - >>> GemspecMapping().eval_ruby_expression("'Foo bar'.freeze") - 'Foo bar' - >>> GemspecMapping().eval_ruby_expression( \ - "['Foo'.freeze, 'bar'.freeze]") - ['Foo', 'bar'] - """ - def evaluator(node): - if isinstance(node, ast.Str): - return node.s - elif isinstance(node, ast.List): - res = [] - for element in node.elts: - val = evaluator(element) - if not val: - return - res.append(val) - return res - - expr = expr.replace('.freeze', '') - try: - # We're parsing Ruby expressions here, but Python's - # ast.parse works for very simple Ruby expressions - # (mainly strings delimited with " or ', and lists - # of such strings). 
- tree = ast.parse(expr, mode='eval') - except (SyntaxError, ValueError): - return - if isinstance(tree, ast.Expression): - return evaluator(tree.body) - - def normalize_homepage(self, s): - if isinstance(s, str): - return {"@id": s} - - def normalize_license(self, s): - if isinstance(s, str): - return [{"@id": "https://spdx.org/licenses/" + s}] - - def normalize_licenses(self, licenses): - if isinstance(licenses, list): - return [{"@id": "https://spdx.org/licenses/" + license} - for license in licenses - if isinstance(license, str)] - - def normalize_author(self, author): - if isinstance(author, str): - return {"@list": [author]} - - def normalize_authors(self, authors): - if isinstance(authors, list): - return {"@list": [author for author in authors - if isinstance(author, str)]} - - -@click.command() -@click.argument('mapping_name') -@click.argument('file_name') -def main(mapping_name, file_name): - from pprint import pprint - with open(file_name, 'rb') as fd: - file_content = fd.read() - res = MAPPINGS[mapping_name]().translate(file_content) - pprint(res) - - -if __name__ == '__main__': - main() diff --git a/swh/indexer/metadata_dictionary/__init__.py b/swh/indexer/metadata_dictionary/__init__.py new file mode 100644 index 0000000..107a8b3 --- /dev/null +++ b/swh/indexer/metadata_dictionary/__init__.py @@ -0,0 +1,38 @@ +import collections + +import click + +from . 
import maven, npm, codemeta, python, ruby + +MAPPINGS = { + 'CodemetaMapping': codemeta.CodemetaMapping, + 'MavenMapping': maven.MavenMapping, + 'NpmMapping': npm.NpmMapping, + 'PythonPkginfoMapping': python.PythonPkginfoMapping, + 'GemspecMapping': ruby.GemspecMapping, +} + + +def list_terms(): + """Returns a dictionary with all supported CodeMeta terms as keys, + and the mappings that support each of them as values.""" + d = collections.defaultdict(set) + for mapping in MAPPINGS.values(): + for term in mapping.supported_terms(): + d[term].add(mapping) + return d + + +@click.command() +@click.argument('mapping_name') +@click.argument('file_name') +def main(mapping_name, file_name): + from pprint import pprint + with open(file_name, 'rb') as fd: + file_content = fd.read() + res = MAPPINGS[mapping_name]().translate(file_content) + pprint(res) + + +if __name__ == '__main__': + main() diff --git a/swh/indexer/metadata_dictionary/base.py b/swh/indexer/metadata_dictionary/base.py new file mode 100644 index 0000000..9bc0ef5 --- /dev/null +++ b/swh/indexer/metadata_dictionary/base.py @@ -0,0 +1,211 @@ +# Copyright (C) 2017-2019 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import abc +import json +import logging + +from swh.indexer.codemeta import SCHEMA_URI +from swh.indexer.codemeta import compact + + +def merge_values(v1, v2): + """If v1 and v2 are of the form `{"@list": l1}` and `{"@list": l2}`, + returns `{"@list": l1 + l2}`. + Otherwise, make them lists (if they are not already) and concatenate + them. 
+ + >>> merge_values('a', 'b') + ['a', 'b'] + >>> merge_values(['a', 'b'], 'c') + ['a', 'b', 'c'] + >>> merge_values({'@list': ['a', 'b']}, {'@list': ['c']}) + {'@list': ['a', 'b', 'c']} + """ + if v1 is None: + return v2 + elif v2 is None: + return v1 + elif isinstance(v1, dict) and set(v1) == {'@list'}: + assert isinstance(v1['@list'], list) + if isinstance(v2, dict) and set(v2) == {'@list'}: + assert isinstance(v2['@list'], list) + return {'@list': v1['@list'] + v2['@list']} + else: + raise ValueError('Cannot merge %r and %r' % (v1, v2)) + else: + if isinstance(v2, dict) and '@list' in v2: + raise ValueError('Cannot merge %r and %r' % (v1, v2)) + if not isinstance(v1, list): + v1 = [v1] + if not isinstance(v2, list): + v2 = [v2] + return v1 + v2 + + +class BaseMapping(metaclass=abc.ABCMeta): + """Base class for mappings to inherit from + + To implement a new mapping: + + - inherit this class + - override translate function + """ + def __init__(self, log_suffix=''): + self.log_suffix = log_suffix + self.log = logging.getLogger('%s.%s' % ( + self.__class__.__module__, + self.__class__.__name__)) + + @property + @abc.abstractmethod + def name(self): + """A name of this mapping, used as an identifier in the + indexer storage.""" + pass + + @classmethod + @abc.abstractmethod + def detect_metadata_files(cls, files): + """ + Detects files potentially containing metadata + + Args: + file_entries (list): list of files + + Returns: + list: list of sha1 (possibly empty) + """ + pass + + @abc.abstractmethod + def translate(self, file_content): + pass + + def normalize_translation(self, metadata): + return compact(metadata) + + +class SingleFileMapping(BaseMapping): + """Base class for all mappings that use a single file as input.""" + + @property + @abc.abstractmethod + def filename(self): + """The .json file to extract metadata from.""" + pass + + @classmethod + def detect_metadata_files(cls, file_entries): + for entry in file_entries: + if entry['name'] == cls.filename: + 
return [entry['sha1']] + return [] + + +class DictMapping(BaseMapping): + """Base class for mappings that take as input a file that is mostly + a key-value store (eg. a shallow JSON dict).""" + + string_fields = [] + '''List of fields that are simple strings, and don't need any + normalization.''' + + @property + @abc.abstractmethod + def mapping(self): + """A translation dict to map dict keys into a canonical name.""" + pass + + @staticmethod + def _normalize_method_name(name): + return name.replace('-', '_') + + @classmethod + def supported_terms(cls): + return { + term for (key, term) in cls.mapping.items() + if key in cls.string_fields + or hasattr(cls, 'translate_' + cls._normalize_method_name(key)) + or hasattr(cls, 'normalize_' + cls._normalize_method_name(key))} + + def _translate_dict(self, content_dict, *, normalize=True): + """ + Translates content by parsing content from a dict object + and translating with the appropriate mapping + + Args: + content_dict (dict): content dict to translate + + Returns: + dict: translated metadata in json-friendly form needed for + the indexer + + """ + translated_metadata = {'@type': SCHEMA_URI + 'SoftwareSourceCode'} + for k, v in content_dict.items(): + # First, check if there is a specific translation + # method for this key + translation_method = getattr( + self, 'translate_' + self._normalize_method_name(k), None) + if translation_method: + translation_method(translated_metadata, v) + elif k in self.mapping: + # if there is no method, but the key is known from the + # crosswalk table + codemeta_key = self.mapping[k] + + # if there is a normalization method, use it on the value + normalization_method = getattr( + self, 'normalize_' + self._normalize_method_name(k), None) + if normalization_method: + v = normalization_method(v) + elif k in self.string_fields and isinstance(v, str): + pass + elif k in self.string_fields and isinstance(v, list): + v = [x for x in v if isinstance(x, str)] + else: + continue + + # set the 
translation metadata with the normalized value + if codemeta_key in translated_metadata: + translated_metadata[codemeta_key] = merge_values( + translated_metadata[codemeta_key], v) + else: + translated_metadata[codemeta_key] = v + if normalize: + return self.normalize_translation(translated_metadata) + else: + return translated_metadata + + +class JsonMapping(DictMapping, SingleFileMapping): + """Base class for all mappings that use a JSON file as input.""" + + def translate(self, raw_content): + """ + Translates content by parsing content from a bytestring containing + json data and translating with the appropriate mapping + + Args: + raw_content (bytes): raw content to translate + + Returns: + dict: translated metadata in json-friendly form needed for + the indexer + + """ + try: + raw_content = raw_content.decode() + except UnicodeDecodeError: + self.log.warning('Error unidecoding from %s', self.log_suffix) + return + try: + content_dict = json.loads(raw_content) + except json.JSONDecodeError: + self.log.warning('Error unjsoning from %s', self.log_suffix) + return + if isinstance(content_dict, dict): + return self._translate_dict(content_dict) diff --git a/swh/indexer/metadata_dictionary/codemeta.py b/swh/indexer/metadata_dictionary/codemeta.py new file mode 100644 index 0000000..bfb336c --- /dev/null +++ b/swh/indexer/metadata_dictionary/codemeta.py @@ -0,0 +1,30 @@ +# Copyright (C) 2018-2019 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import json + +from swh.indexer.codemeta import CODEMETA_TERMS +from swh.indexer.codemeta import expand +from .base import SingleFileMapping + + +class CodemetaMapping(SingleFileMapping): + """ + dedicated class for CodeMeta (codemeta.json) mapping and translation + """ + name = 'codemeta' + filename = b'codemeta.json' + string_fields = None + + 
class MavenMapping(DictMapping, SingleFileMapping):
    """
    dedicated class for Maven (pom.xml) mapping and translation

    Simple string fields go through the 'Java (Maven)' column of the
    crosswalk table; repositories and licenses are nested structures and
    are handled by `parse_repositories` and `parse_licenses`.
    """
    name = 'maven'
    filename = b'pom.xml'
    mapping = CROSSWALK_TABLE['Java (Maven)']
    string_fields = ['name', 'version', 'description', 'email']

    def translate(self, content):
        """Translates the raw content of a pom.xml file.

        Args:
            content (bytes): raw content of the file

        Returns:
            dict: the translated metadata, or None (after logging a
            warning) when the content cannot be parsed/decoded as XML or
            when its <project> node is not a mapping.
        """
        try:
            d = xmltodict.parse(content).get('project') or {}
        except xml.parsers.expat.ExpatError:
            self.log.warning('Error parsing XML from %s', self.log_suffix)
            return None
        except UnicodeDecodeError:
            self.log.warning('Error unidecoding XML from %s', self.log_suffix)
            return None
        except (LookupError, ValueError):
            # unknown encoding or multi-byte encoding
            self.log.warning('Error detecting XML encoding from %s',
                             self.log_suffix)
            return None
        if not isinstance(d, dict):
            # A <project> node holding only text is parsed to a str, which
            # has no key/value pairs to translate.
            self.log.warning('Skipping ill-formed XML content from %s',
                             self.log_suffix)
            return None
        metadata = self._translate_dict(d, normalize=False)
        metadata[SCHEMA_URI+'codeRepository'] = self.parse_repositories(d)
        metadata[SCHEMA_URI+'license'] = self.parse_licenses(d)
        return self.normalize_translation(metadata)

    # Repository used when the POM does not declare any.
    _default_repository = {'url': 'https://repo.maven.apache.org/maven2/'}

    def parse_repositories(self, d):
        """https://maven.apache.org/pom.html#Repositories

        Returns a list of `{'@id': url}` dicts (one per usable
        repository), or None if there is none. The example below yields
        nothing because the document has no groupId/artifactId.

        >>> import xmltodict
        >>> from pprint import pprint
        >>> d = xmltodict.parse('''
        ... <repositories>
        ...   <repository>
        ...     <id>codehausSnapshots</id>
        ...     <name>Codehaus Snapshots</name>
        ...     <url>http://snapshots.maven.codehaus.org/maven2</url>
        ...     <layout>default</layout>
        ...   </repository>
        ... </repositories>
        ... ''')
        >>> MavenMapping().parse_repositories(d)
        """
        repositories = d.get('repositories')
        if not repositories:
            results = [self.parse_repository(d, self._default_repository)]
        elif isinstance(repositories, dict):
            repositories = repositories.get('repository') or []
            # xmltodict yields a single item (not a list) when there is
            # only one <repository> child.
            if not isinstance(repositories, list):
                repositories = [repositories]
            results = [self.parse_repository(d, repo)
                       for repo in repositories]
        else:
            results = []
        return [res for res in results if res] or None

    def parse_repository(self, d, repo):
        """Builds the artifact URL for one repository entry.

        Args:
            d (dict): the parsed <project> node (read for `groupId` and
                `artifactId`)
            repo: the parsed <repository> node

        Returns:
            dict: `{'@id': '<url>/<group/id/path>/<artifactId>'}`, or None
            if `repo` is not a dict, uses a non-default layout, or if any
            of the url/groupId/artifactId is missing or not a string.
        """
        # URLs are always '/'-separated, so use posixpath rather than
        # the host-dependent os.path.
        import posixpath

        if not isinstance(repo, dict):
            return
        if repo.get('layout', 'default') != 'default':
            return  # TODO ?
        url = repo.get('url')
        group_id = d.get('groupId')
        artifact_id = d.get('artifactId')
        if (isinstance(url, str) and isinstance(group_id, str)
                and isinstance(artifact_id, str)):
            repo_url = posixpath.join(
                url, *group_id.split('.'), artifact_id)
            return {"@id": repo_url}

    def normalize_groupId(self, id_):
        """https://maven.apache.org/pom.html#Maven_Coordinates

        >>> MavenMapping().normalize_groupId('org.example')
        {'@id': 'org.example'}
        """
        if isinstance(id_, str):
            return {"@id": id_}

    def parse_licenses(self, d):
        """https://maven.apache.org/pom.html#Licenses

        Returns a list of `{'@id': url}` dicts, one per license that has
        a string url, or None if there is none.

        >>> import xmltodict
        >>> import json
        >>> d = xmltodict.parse('''
        ... <licenses>
        ...   <license>
        ...     <name>Apache License, Version 2.0</name>
        ...     <url>https://www.apache.org/licenses/LICENSE-2.0.txt</url>
        ...   </license>
        ... </licenses>
        ... ''')
        >>> print(json.dumps(d, indent=4))
        {
            "licenses": {
                "license": {
                    "name": "Apache License, Version 2.0",
                    "url": "https://www.apache.org/licenses/LICENSE-2.0.txt"
                }
            }
        }
        >>> MavenMapping().parse_licenses(d)
        [{'@id': 'https://www.apache.org/licenses/LICENSE-2.0.txt'}]

        or, if there are more than one license:

        >>> import xmltodict
        >>> from pprint import pprint
        >>> d = xmltodict.parse('''
        ... <licenses>
        ...   <license>
        ...     <name>Apache License, Version 2.0</name>
        ...     <url>https://www.apache.org/licenses/LICENSE-2.0.txt</url>
        ...   </license>
        ...   <license>
        ...     <name>MIT License</name>
        ...     <url>https://opensource.org/licenses/MIT</url>
        ...   </license>
        ... </licenses>
        ... ''')
        >>> pprint(MavenMapping().parse_licenses(d))
        [{'@id': 'https://www.apache.org/licenses/LICENSE-2.0.txt'},
         {'@id': 'https://opensource.org/licenses/MIT'}]
        """

        licenses = d.get('licenses')
        if not isinstance(licenses, dict):
            return
        licenses = licenses.get('license')
        if isinstance(licenses, dict):
            # single <license> child: xmltodict does not wrap it in a list
            licenses = [licenses]
        elif not isinstance(licenses, list):
            return
        return [{"@id": license['url']}
                for license in licenses
                if isinstance(license, dict)
                and isinstance(license.get('url'), str)] or None
def normalize_repository(self, d):
    """https://docs.npmjs.com/files/package.json#repository

    Accepts either the `{type, url}` object form or one of the string
    forms (full URL, `provider:user/repo` shortcut, or bare `user/repo`
    which is taken as a GitHub shortcut). Returns `{'@id': url}`, or
    None when the value has none of these shapes or uses an unknown
    shortcut provider.

    >>> NpmMapping().normalize_repository({
    ...     'type': 'git',
    ...     'url': 'https://example.org/foo.git'
    ... })
    {'@id': 'git+https://example.org/foo.git'}
    >>> NpmMapping().normalize_repository(
    ...     'gitlab:foo/bar')
    {'@id': 'git+https://gitlab.com/foo/bar.git'}
    >>> NpmMapping().normalize_repository(
    ...     'foo/bar')
    {'@id': 'git+https://github.com/foo/bar.git'}
    """
    # Object form: both 'type' and 'url' must be strings.
    if isinstance(d, dict):
        well_formed = (isinstance(d.get('type'), str)
                       and isinstance(d.get('url'), str))
        if not well_formed:
            return None
        return {'@id': '{type}+{url}'.format(**d)}

    if not isinstance(d, str):
        return None

    # String form, already a complete URL.
    if '://' in d:
        return {'@id': d}

    # String form without provider prefix: GitHub is the default.
    if ':' not in d:
        return {'@id': self._schema_shortcuts['github'] % d}

    # String form with a provider prefix.
    (provider, path) = d.split(':', 1)
    if provider not in self._schema_shortcuts:
        return None
    return {'@id': self._schema_shortcuts[provider] % path}
def normalize_author(self, d):
    """https://docs.npmjs.com/files/package.json#people-fields-author-contributors

    Accepts either the object form (`name`/`email`/`url` keys) or the
    string form `Name <email> (url)`, where email and url are optional.

    Returns a `{'@list': [person]}` dict, or None if `d` is neither a
    dict nor a string, or is a string that does not match the expected
    pattern (eg. it contains a line break).

    >>> from pprint import pprint
    >>> pprint(NpmMapping().normalize_author({
    ...     'name': 'John Doe',
    ...     'email': 'john.doe@example.org',
    ...     'url': 'https://example.org/~john.doe',
    ... }))
    {'@list': [{'@type': 'http://schema.org/Person',
                'http://schema.org/email': 'john.doe@example.org',
                'http://schema.org/name': 'John Doe',
                'http://schema.org/url': {'@id': 'https://example.org/~john.doe'}}]}
    >>> pprint(NpmMapping().normalize_author(
    ...     'John Doe <john.doe@example.org> (https://example.org/~john.doe)'
    ... ))
    {'@list': [{'@type': 'http://schema.org/Person',
                'http://schema.org/email': 'john.doe@example.org',
                'http://schema.org/name': 'John Doe',
                'http://schema.org/url': {'@id': 'https://example.org/~john.doe'}}]}
    """ # noqa
    if isinstance(d, dict):
        name = d.get('name', None)
        email = d.get('email', None)
        url = d.get('url', None)
    elif isinstance(d, str):
        match = self._parse_author.match(d)
        if match is None:
            # The pattern does not span line breaks, so such strings
            # yield no match; treat them as invalid instead of crashing.
            return None
        name = match.group('name')
        email = match.group('email')
        url = match.group('url')
    else:
        return None

    # Only well-typed, non-empty fields make it into the Person node.
    author = {'@type': SCHEMA_URI+'Person'}
    if name and isinstance(name, str):
        author[SCHEMA_URI+'name'] = name
    if email and isinstance(email, str):
        author[SCHEMA_URI+'email'] = email
    if url and isinstance(url, str):
        author[SCHEMA_URI+'url'] = {'@id': url}
    return {"@list": [author]}
class LinebreakPreservingEmailPolicy(email.policy.EmailPolicy):
    """Email policy that keeps the line breaks of folded header values.

    Only the single space that follows each line break is dropped; the
    line break itself is kept.
    """

    def header_fetch_parse(self, name, value):
        """Returns the header object for `value`.

        Values that already carry a `name` attribute are returned as
        they are; other values get every newline-plus-space sequence
        reduced to a bare newline and are then passed to
        `header_factory`.
        """
        already_parsed = hasattr(value, 'name')
        if not already_parsed:
            unfolded = value.replace('\n ', '\n')
            value = self.header_factory(name, unfolded)
        return value
def translate(self, content):
    """Translates the raw bytes of a PKG-INFO file.

    Headers are grouped by lower-cased name (a header may be repeated),
    skipping the 'UNKNOWN' placeholder values. Once translated, the
    author name and email are folded into a single Person node.
    """
    headers = self._parser.parsebytes(content)

    fields = {}
    for (raw_key, value) in headers.items():
        key = _normalize_pkginfo_key(raw_key)
        if value == 'UNKNOWN':
            continue
        fields.setdefault(key, []).append(value)

    metadata = self._translate_dict(fields, normalize=False)

    author_key = SCHEMA_URI+'author'
    email_key = SCHEMA_URI+'email'
    if author_key in metadata or email_key in metadata:
        # Both are lists at this point; only the first item is kept.
        author_name = metadata.pop(author_key, [None])[0]
        author_email = metadata.pop(email_key, [None])[0]
        person = {
            '@type': SCHEMA_URI+'Person',
            SCHEMA_URI+'name': author_name,
            SCHEMA_URI+'email': author_email,
        }
        metadata[author_key] = {'@list': [person]}

    return self.normalize_translation(metadata)
def eval_ruby_expression(self, expr):
    """Very simple evaluator of Ruby expressions.

    Only string literals and lists of non-empty string literals are
    supported (with an optional `.freeze` suffix, which is ignored).
    Returns the corresponding Python value, or None for anything else.

    >>> GemspecMapping().eval_ruby_expression('"Foo bar"')
    'Foo bar'
    >>> GemspecMapping().eval_ruby_expression("'Foo bar'")
    'Foo bar'
    >>> GemspecMapping().eval_ruby_expression("['Foo', 'bar']")
    ['Foo', 'bar']
    >>> GemspecMapping().eval_ruby_expression("'Foo bar'.freeze")
    'Foo bar'
    >>> GemspecMapping().eval_ruby_expression(
    ...     "['Foo'.freeze, 'bar'.freeze]")
    ['Foo', 'bar']
    """
    def evaluator(node):
        if isinstance(node, ast.Constant):
            # Python >= 3.8 parses every literal to ast.Constant;
            # only strings are of interest here.
            return node.value if isinstance(node.value, str) else None
        elif isinstance(node, ast.List):
            res = []
            for element in node.elts:
                val = evaluator(element)
                if not val:
                    # unsupported or empty item: give up on the list
                    return
                res.append(val)
            return res
        elif type(node).__name__ == 'Str':
            # Python < 3.8 still produces the legacy ast.Str node.
            # Matched by name so that the deprecated/removed attribute
            # is never looked up on newer interpreters.
            return node.s

    expr = expr.replace('.freeze', '')
    try:
        # We're parsing Ruby expressions here, but Python's
        # ast.parse works for very simple Ruby expressions
        # (mainly strings delimited with " or ', and lists
        # of such strings).
        tree = ast.parse(expr, mode='eval')
    except (SyntaxError, ValueError):
        return
    if isinstance(tree, ast.Expression):
        return evaluator(tree.body)
License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import json import unittest from hypothesis import given, strategies, settings, HealthCheck import xmltodict from swh.model.hashutil import hash_to_bytes -from swh.indexer.codemeta import CODEMETA_TERMS -from swh.indexer.metadata_dictionary import ( - CROSSWALK_TABLE, MAPPINGS, merge_values) +from swh.indexer.codemeta import CODEMETA_TERMS, CROSSWALK_TABLE +from swh.indexer.metadata_dictionary import MAPPINGS +from swh.indexer.metadata_dictionary.base import merge_values from swh.indexer.metadata_detector import ( detect_metadata, extract_minimal_metadata_dict ) from swh.indexer.metadata import ( ContentMetadataIndexer, RevisionMetadataIndexer ) from .utils import ( BASE_TEST_CONFIG, fill_obj_storage, fill_storage, YARN_PARSER_METADATA, json_document_strategy ) TRANSLATOR_TOOL = { 'name': 'swh-metadata-translator', 'version': '0.0.2', 'configuration': { 'type': 'local', 'context': 'NpmMapping' } } class ContentMetadataTestIndexer(ContentMetadataIndexer): """Specific Metadata whose configuration is enough to satisfy the indexing tests. """ def parse_config_file(self, *args, **kwargs): assert False, 'should not be called; the rev indexer configures it.' 
REVISION_METADATA_CONFIG = { **BASE_TEST_CONFIG, 'tools': TRANSLATOR_TOOL, } class Metadata(unittest.TestCase): """ Tests metadata_mock_tool tool for Metadata detection """ def setUp(self): """ shows the entire diff in the results """ self.maxDiff = None self.npm_mapping = MAPPINGS['NpmMapping']() self.codemeta_mapping = MAPPINGS['CodemetaMapping']() self.maven_mapping = MAPPINGS['MavenMapping']() self.pkginfo_mapping = MAPPINGS['PythonPkginfoMapping']() self.gemspec_mapping = MAPPINGS['GemspecMapping']() def test_crosstable(self): self.assertEqual(CROSSWALK_TABLE['NodeJS'], { 'repository': 'http://schema.org/codeRepository', 'os': 'http://schema.org/operatingSystem', 'cpu': 'http://schema.org/processorRequirements', 'engines': 'http://schema.org/processorRequirements', 'author': 'http://schema.org/author', 'author.email': 'http://schema.org/email', 'author.name': 'http://schema.org/name', 'contributor': 'http://schema.org/contributor', 'keywords': 'http://schema.org/keywords', 'license': 'http://schema.org/license', 'version': 'http://schema.org/version', 'description': 'http://schema.org/description', 'name': 'http://schema.org/name', 'bugs': 'https://codemeta.github.io/terms/issueTracker', 'homepage': 'http://schema.org/url' }) def test_merge_values(self): self.assertEqual( merge_values('a', 'b'), ['a', 'b']) self.assertEqual( merge_values(['a', 'b'], 'c'), ['a', 'b', 'c']) self.assertEqual( merge_values('a', ['b', 'c']), ['a', 'b', 'c']) self.assertEqual( merge_values({'@list': ['a']}, {'@list': ['b']}), {'@list': ['a', 'b']}) self.assertEqual( merge_values({'@list': ['a', 'b']}, {'@list': ['c']}), {'@list': ['a', 'b', 'c']}) with self.assertRaises(ValueError): merge_values({'@list': ['a']}, 'b') with self.assertRaises(ValueError): merge_values('a', {'@list': ['b']}) with self.assertRaises(ValueError): merge_values({'@list': ['a']}, ['b']) with self.assertRaises(ValueError): merge_values(['a'], {'@list': ['b']}) self.assertEqual( merge_values('a', None), 'a') 
self.assertEqual( merge_values(['a', 'b'], None), ['a', 'b']) self.assertEqual( merge_values(None, ['b', 'c']), ['b', 'c']) self.assertEqual( merge_values({'@list': ['a']}, None), {'@list': ['a']}) self.assertEqual( merge_values(None, {'@list': ['a']}), {'@list': ['a']}) def test_compute_metadata_none(self): """ testing content empty content is empty should return None """ # given content = b"" # None if no metadata was found or an error occurred declared_metadata = None # when result = self.npm_mapping.translate(content) # then self.assertEqual(declared_metadata, result) def test_compute_metadata_npm(self): """ testing only computation of metadata with hard_mapping_npm """ # given content = b""" { "name": "test_metadata", "version": "0.0.2", "description": "Simple package.json test for indexer", "repository": { "type": "git", "url": "https://github.com/moranegg/metadata_test" }, "author": { "email": "moranegg@example.com", "name": "Morane G" } } """ declared_metadata = { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'type': 'SoftwareSourceCode', 'name': 'test_metadata', 'version': '0.0.2', 'description': 'Simple package.json test for indexer', 'codeRepository': 'git+https://github.com/moranegg/metadata_test', 'author': [{ 'type': 'Person', 'name': 'Morane G', 'email': 'moranegg@example.com', }], } # when result = self.npm_mapping.translate(content) # then self.assertEqual(declared_metadata, result) def test_extract_minimal_metadata_dict(self): """ Test the creation of a coherent minimal metadata set """ # given metadata_list = [{ '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'name': 'test_1', 'version': '0.0.2', 'description': 'Simple package.json test for indexer', 'codeRepository': 'git+https://github.com/moranegg/metadata_test', }, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'name': 'test_0_1', 'version': '0.0.2', 'description': 'Simple package.json test for indexer', 'codeRepository': 
'git+https://github.com/moranegg/metadata_test' }, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'name': 'test_metadata', 'version': '0.0.2', 'author': 'moranegg', }] # when results = extract_minimal_metadata_dict(metadata_list) # then expected_results = { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', "version": '0.0.2', "description": 'Simple package.json test for indexer', "name": ['test_1', 'test_0_1', 'test_metadata'], "author": ['moranegg'], "codeRepository": 'git+https://github.com/moranegg/metadata_test', } self.assertEqual(expected_results, results) def test_index_content_metadata_npm(self): """ testing NPM with package.json - one sha1 uses a file that can't be translated to metadata and should return None in the translated metadata """ # given sha1s = [ hash_to_bytes('26a9f72a7c87cc9205725cfd879f514ff4f3d8d5'), hash_to_bytes('d4c647f0fc257591cc9ba1722484229780d1c607'), hash_to_bytes('02fb2c89e14f7fab46701478c83779c7beb7b069'), ] # this metadata indexer computes only metadata for package.json # in npm context with a hard mapping config = BASE_TEST_CONFIG.copy() config['tools'] = [TRANSLATOR_TOOL] metadata_indexer = ContentMetadataTestIndexer(config=config) fill_obj_storage(metadata_indexer.objstorage) fill_storage(metadata_indexer.storage) # when metadata_indexer.run(sha1s, policy_update='ignore-dups') results = list(metadata_indexer.idx_storage.content_metadata_get( sha1s)) expected_results = [{ 'metadata': { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'type': 'SoftwareSourceCode', 'codeRepository': 'git+https://github.com/moranegg/metadata_test', 'description': 'Simple package.json test for indexer', 'name': 'test_metadata', 'version': '0.0.1' }, 'id': hash_to_bytes('26a9f72a7c87cc9205725cfd879f514ff4f3d8d5'), }, { 'metadata': { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'type': 'SoftwareSourceCode', 'issueTracker': 'https://github.com/npm/npm/issues', 'author': [{ 'type': 'Person', 'name': 'Isaac 
Z. Schlueter', 'email': 'i@izs.me', 'url': 'http://blog.izs.me', }], 'codeRepository': 'git+https://github.com/npm/npm', 'description': 'a package manager for JavaScript', 'license': 'https://spdx.org/licenses/Artistic-2.0', 'version': '5.0.3', 'name': 'npm', 'keywords': [ 'install', 'modules', 'package manager', 'package.json' ], 'url': 'https://docs.npmjs.com/' }, 'id': hash_to_bytes('d4c647f0fc257591cc9ba1722484229780d1c607') }] for result in results: del result['tool'] # The assertion below returns False sometimes because of nested lists self.assertEqual(expected_results, results) def test_npm_bugs_normalization(self): # valid dictionary package_json = b"""{ "name": "foo", "bugs": { "url": "https://github.com/owner/project/issues", "email": "foo@example.com" } }""" result = self.npm_mapping.translate(package_json) self.assertEqual(result, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'name': 'foo', 'issueTracker': 'https://github.com/owner/project/issues', 'type': 'SoftwareSourceCode', }) # "invalid" dictionary package_json = b"""{ "name": "foo", "bugs": { "email": "foo@example.com" } }""" result = self.npm_mapping.translate(package_json) self.assertEqual(result, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'name': 'foo', 'type': 'SoftwareSourceCode', }) # string package_json = b"""{ "name": "foo", "bugs": "https://github.com/owner/project/issues" }""" result = self.npm_mapping.translate(package_json) self.assertEqual(result, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'name': 'foo', 'issueTracker': 'https://github.com/owner/project/issues', 'type': 'SoftwareSourceCode', }) def test_npm_repository_normalization(self): # normal package_json = b"""{ "name": "foo", "repository": { "type" : "git", "url" : "https://github.com/npm/cli.git" } }""" result = self.npm_mapping.translate(package_json) self.assertEqual(result, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'name': 'foo', 'codeRepository': 
'git+https://github.com/npm/cli.git', 'type': 'SoftwareSourceCode', }) # missing url package_json = b"""{ "name": "foo", "repository": { "type" : "git" } }""" result = self.npm_mapping.translate(package_json) self.assertEqual(result, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'name': 'foo', 'type': 'SoftwareSourceCode', }) # github shortcut package_json = b"""{ "name": "foo", "repository": "github:npm/cli" }""" result = self.npm_mapping.translate(package_json) expected_result = { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'name': 'foo', 'codeRepository': 'git+https://github.com/npm/cli.git', 'type': 'SoftwareSourceCode', } self.assertEqual(result, expected_result) # github shortshortcut package_json = b"""{ "name": "foo", "repository": "npm/cli" }""" result = self.npm_mapping.translate(package_json) self.assertEqual(result, expected_result) # gitlab shortcut package_json = b"""{ "name": "foo", "repository": "gitlab:user/repo" }""" result = self.npm_mapping.translate(package_json) self.assertEqual(result, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'name': 'foo', 'codeRepository': 'git+https://gitlab.com/user/repo.git', 'type': 'SoftwareSourceCode', }) def test_detect_metadata_package_json(self): # given df = [{ 'sha1_git': b'abc', 'name': b'index.js', 'target': b'abc', 'length': 897, 'status': 'visible', 'type': 'file', 'perms': 33188, 'dir_id': b'dir_a', 'sha1': b'bcd' }, { 'sha1_git': b'aab', 'name': b'package.json', 'target': b'aab', 'length': 712, 'status': 'visible', 'type': 'file', 'perms': 33188, 'dir_id': b'dir_a', 'sha1': b'cde' }] # when results = detect_metadata(df) expected_results = { 'NpmMapping': [ b'cde' ] } # then self.assertEqual(expected_results, results) def test_compute_metadata_valid_codemeta(self): raw_content = ( b"""{ "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "@type": "SoftwareSourceCode", "identifier": "CodeMeta", "description": "CodeMeta is a concept vocabulary that can 
be used to standardize the exchange of software metadata across repositories and organizations.", "name": "CodeMeta: Minimal metadata schemas for science software and code, in JSON-LD", "codeRepository": "https://github.com/codemeta/codemeta", "issueTracker": "https://github.com/codemeta/codemeta/issues", "license": "https://spdx.org/licenses/Apache-2.0", "version": "2.0", "author": [ { "@type": "Person", "givenName": "Carl", "familyName": "Boettiger", "email": "cboettig@gmail.com", "@id": "http://orcid.org/0000-0002-1642-628X" }, { "@type": "Person", "givenName": "Matthew B.", "familyName": "Jones", "email": "jones@nceas.ucsb.edu", "@id": "http://orcid.org/0000-0003-0077-4738" } ], "maintainer": { "@type": "Person", "givenName": "Carl", "familyName": "Boettiger", "email": "cboettig@gmail.com", "@id": "http://orcid.org/0000-0002-1642-628X" }, "contIntegration": "https://travis-ci.org/codemeta/codemeta", "developmentStatus": "active", "downloadUrl": "https://github.com/codemeta/codemeta/archive/2.0.zip", "funder": { "@id": "https://doi.org/10.13039/100000001", "@type": "Organization", "name": "National Science Foundation" }, "funding":"1549758; Codemeta: A Rosetta Stone for Metadata in Scientific Software", "keywords": [ "metadata", "software" ], "version":"2.0", "dateCreated":"2017-06-05", "datePublished":"2017-06-05", "programmingLanguage": "JSON-LD" }""") # noqa expected_result = { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "identifier": "CodeMeta", "description": "CodeMeta is a concept vocabulary that can " "be used to standardize the exchange of software metadata " "across repositories and organizations.", "name": "CodeMeta: Minimal metadata schemas for science " "software and code, in JSON-LD", "codeRepository": "https://github.com/codemeta/codemeta", "issueTracker": "https://github.com/codemeta/codemeta/issues", "license": "https://spdx.org/licenses/Apache-2.0", "version": "2.0", "author": [ { "type": "Person", 
"givenName": "Carl", "familyName": "Boettiger", "email": "cboettig@gmail.com", "id": "http://orcid.org/0000-0002-1642-628X" }, { "type": "Person", "givenName": "Matthew B.", "familyName": "Jones", "email": "jones@nceas.ucsb.edu", "id": "http://orcid.org/0000-0003-0077-4738" } ], "maintainer": { "type": "Person", "givenName": "Carl", "familyName": "Boettiger", "email": "cboettig@gmail.com", "id": "http://orcid.org/0000-0002-1642-628X" }, "contIntegration": "https://travis-ci.org/codemeta/codemeta", "developmentStatus": "active", "downloadUrl": "https://github.com/codemeta/codemeta/archive/2.0.zip", "funder": { "id": "https://doi.org/10.13039/100000001", "type": "Organization", "name": "National Science Foundation" }, "funding": "1549758; Codemeta: A Rosetta Stone for Metadata " "in Scientific Software", "keywords": [ "metadata", "software" ], "version": "2.0", "dateCreated": "2017-06-05", "datePublished": "2017-06-05", "programmingLanguage": "JSON-LD" } result = self.codemeta_mapping.translate(raw_content) self.assertEqual(result, expected_result) def test_compute_metadata_codemeta_alternate_context(self): raw_content = ( b"""{ "@context": "https://raw.githubusercontent.com/codemeta/codemeta/master/codemeta.jsonld", "@type": "SoftwareSourceCode", "identifier": "CodeMeta" }""") # noqa expected_result = { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "identifier": "CodeMeta", } result = self.codemeta_mapping.translate(raw_content) self.assertEqual(result, expected_result) def test_compute_metadata_maven(self): raw_content = b""" Maven Default Project 4.0.0 com.mycompany.app my-app 1.2.3 central Maven Repository Switchboard default http://repo1.maven.org/maven2 false Apache License, Version 2.0 https://www.apache.org/licenses/LICENSE-2.0.txt repo A business-friendly OSS license """ result = self.maven_mapping.translate(raw_content) self.assertEqual(result, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'type': 
'SoftwareSourceCode', 'name': 'Maven Default Project', 'identifier': 'com.mycompany.app', 'version': '1.2.3', 'license': 'https://www.apache.org/licenses/LICENSE-2.0.txt', 'codeRepository': 'http://repo1.maven.org/maven2/com/mycompany/app/my-app', }) def test_compute_metadata_maven_empty(self): raw_content = b""" """ result = self.maven_mapping.translate(raw_content) self.assertEqual(result, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'type': 'SoftwareSourceCode', }) def test_compute_metadata_maven_almost_empty(self): raw_content = b""" """ result = self.maven_mapping.translate(raw_content) self.assertEqual(result, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'type': 'SoftwareSourceCode', }) def test_compute_metadata_maven_invalid_xml(self): expected_warning = ( - 'WARNING:swh.indexer.metadata_dictionary.MavenMapping:' + 'WARNING:swh.indexer.metadata_dictionary.maven.MavenMapping:' 'Error parsing XML from foo') raw_content = b""" """ with self.assertLogs('swh.indexer.metadata_dictionary', level='WARNING') as cm: result = MAPPINGS["MavenMapping"]('foo').translate(raw_content) self.assertEqual(cm.output, [expected_warning]) self.assertEqual(result, None) raw_content = b""" """ with self.assertLogs('swh.indexer.metadata_dictionary', level='WARNING') as cm: result = MAPPINGS["MavenMapping"]('foo').translate(raw_content) self.assertEqual(cm.output, [expected_warning]) self.assertEqual(result, None) def test_compute_metadata_maven_unknown_encoding(self): expected_warning = ( - 'WARNING:swh.indexer.metadata_dictionary.MavenMapping:' + 'WARNING:swh.indexer.metadata_dictionary.maven.MavenMapping:' 'Error detecting XML encoding from foo') raw_content = b""" """ with self.assertLogs('swh.indexer.metadata_dictionary', level='WARNING') as cm: result = MAPPINGS["MavenMapping"]('foo').translate(raw_content) self.assertEqual(cm.output, [expected_warning]) self.assertEqual(result, None) raw_content = b""" """ with 
self.assertLogs('swh.indexer.metadata_dictionary', level='WARNING') as cm: result = MAPPINGS["MavenMapping"]('foo').translate(raw_content) self.assertEqual(cm.output, [expected_warning]) self.assertEqual(result, None) def test_compute_metadata_maven_invalid_encoding(self): expected_warning = ( - 'WARNING:swh.indexer.metadata_dictionary.MavenMapping:' + 'WARNING:swh.indexer.metadata_dictionary.maven.MavenMapping:' 'Error unidecoding XML from foo') raw_content = b""" """ with self.assertLogs('swh.indexer.metadata_dictionary', level='WARNING') as cm: result = MAPPINGS["MavenMapping"]('foo').translate(raw_content) self.assertEqual(cm.output, [expected_warning]) self.assertEqual(result, None) def test_compute_metadata_maven_minimal(self): raw_content = b""" Maven Default Project 4.0.0 com.mycompany.app my-app 1.2.3 """ result = self.maven_mapping.translate(raw_content) self.assertEqual(result, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'type': 'SoftwareSourceCode', 'name': 'Maven Default Project', 'identifier': 'com.mycompany.app', 'version': '1.2.3', 'codeRepository': 'https://repo.maven.apache.org/maven2/com/mycompany/app/my-app', }) def test_compute_metadata_maven_empty_nodes(self): raw_content = b""" Maven Default Project 4.0.0 com.mycompany.app my-app 1.2.3 """ result = self.maven_mapping.translate(raw_content) self.assertEqual(result, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'type': 'SoftwareSourceCode', 'name': 'Maven Default Project', 'identifier': 'com.mycompany.app', 'version': '1.2.3', 'codeRepository': 'https://repo.maven.apache.org/maven2/com/mycompany/app/my-app', }) raw_content = b""" Maven Default Project 4.0.0 com.mycompany.app my-app """ result = self.maven_mapping.translate(raw_content) self.assertEqual(result, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'type': 'SoftwareSourceCode', 'name': 'Maven Default Project', 'identifier': 'com.mycompany.app', 'codeRepository': 
'https://repo.maven.apache.org/maven2/com/mycompany/app/my-app', }) raw_content = b""" 4.0.0 com.mycompany.app my-app 1.2.3 """ result = self.maven_mapping.translate(raw_content) self.assertEqual(result, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'type': 'SoftwareSourceCode', 'identifier': 'com.mycompany.app', 'version': '1.2.3', 'codeRepository': 'https://repo.maven.apache.org/maven2/com/mycompany/app/my-app', }) raw_content = b""" Maven Default Project 4.0.0 com.mycompany.app my-app 1.2.3 """ result = self.maven_mapping.translate(raw_content) self.assertEqual(result, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'type': 'SoftwareSourceCode', 'name': 'Maven Default Project', 'identifier': 'com.mycompany.app', 'version': '1.2.3', 'codeRepository': 'https://repo.maven.apache.org/maven2/com/mycompany/app/my-app', }) raw_content = b""" 1.2.3 """ result = self.maven_mapping.translate(raw_content) self.assertEqual(result, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'type': 'SoftwareSourceCode', 'version': '1.2.3', }) def test_compute_metadata_maven_invalid_licenses(self): raw_content = b""" Maven Default Project 4.0.0 com.mycompany.app my-app 1.2.3 foo """ result = self.maven_mapping.translate(raw_content) self.assertEqual(result, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'type': 'SoftwareSourceCode', 'name': 'Maven Default Project', 'identifier': 'com.mycompany.app', 'version': '1.2.3', 'codeRepository': 'https://repo.maven.apache.org/maven2/com/mycompany/app/my-app', }) def test_compute_metadata_maven_multiple(self): '''Tests when there are multiple code repos and licenses.''' raw_content = b""" Maven Default Project 4.0.0 com.mycompany.app my-app 1.2.3 central Maven Repository Switchboard default http://repo1.maven.org/maven2 false example Example Maven Repo default http://example.org/maven2 Apache License, Version 2.0 https://www.apache.org/licenses/LICENSE-2.0.txt repo A business-friendly OSS 
license MIT license https://opensource.org/licenses/MIT """ result = self.maven_mapping.translate(raw_content) self.assertEqual(result, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'type': 'SoftwareSourceCode', 'name': 'Maven Default Project', 'identifier': 'com.mycompany.app', 'version': '1.2.3', 'license': [ 'https://www.apache.org/licenses/LICENSE-2.0.txt', 'https://opensource.org/licenses/MIT', ], 'codeRepository': [ 'http://repo1.maven.org/maven2/com/mycompany/app/my-app', 'http://example.org/maven2/com/mycompany/app/my-app', ] }) def test_compute_metadata_pkginfo(self): raw_content = (b"""\ Metadata-Version: 2.1 Name: swh.core Version: 0.0.49 Summary: Software Heritage core utilities Home-page: https://forge.softwareheritage.org/diffusion/DCORE/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Source, https://forge.softwareheritage.org/source/swh-core Description: swh-core ======== \x20 core library for swh's modules: - config parser - hash computations - serialization - logging mechanism \x20 Platform: UNKNOWN Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 5 - Production/Stable Description-Content-Type: text/markdown Provides-Extra: testing """) # noqa result = self.pkginfo_mapping.translate(raw_content) self.assertCountEqual(result['description'], [ 'Software Heritage core utilities', # note the comma here 'swh-core\n' '========\n' '\n' "core library for swh's modules:\n" '- config parser\n' '- hash computations\n' '- serialization\n' '- logging mechanism\n' ''], result) del result['description'] self.assertEqual(result, { '@context': 
'https://doi.org/10.5063/schema/codemeta-2.0', 'type': 'SoftwareSourceCode', 'url': 'https://forge.softwareheritage.org/diffusion/DCORE/', 'name': 'swh.core', 'author': [{ 'type': 'Person', 'name': 'Software Heritage developers', 'email': 'swh-devel@inria.fr', }], 'version': '0.0.49', }) def test_compute_metadata_pkginfo_utf8(self): raw_content = (b'''\ Metadata-Version: 1.1 Name: snowpyt Description-Content-Type: UNKNOWN Description: foo Hydrology N\xc2\xb083 ''') # noqa result = self.pkginfo_mapping.translate(raw_content) self.assertEqual(result, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'type': 'SoftwareSourceCode', 'name': 'snowpyt', 'description': 'foo\nHydrology N°83', }) def test_compute_metadata_pkginfo_keywords(self): raw_content = (b"""\ Metadata-Version: 2.1 Name: foo Keywords: foo bar baz """) # noqa result = self.pkginfo_mapping.translate(raw_content) self.assertEqual(result, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'type': 'SoftwareSourceCode', 'name': 'foo', 'keywords': ['foo', 'bar', 'baz'], }) def test_compute_metadata_pkginfo_license(self): raw_content = (b"""\ Metadata-Version: 2.1 Name: foo License: MIT """) # noqa result = self.pkginfo_mapping.translate(raw_content) self.assertEqual(result, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'type': 'SoftwareSourceCode', 'name': 'foo', 'license': 'MIT', }) def test_gemspec_base(self): raw_content = b""" Gem::Specification.new do |s| s.name = 'example' s.version = '0.1.0' s.licenses = ['MIT'] s.summary = "This is an example!" s.description = "Much longer explanation of the example!" 
s.authors = ["Ruby Coder"] s.email = 'rubycoder@example.com' s.files = ["lib/example.rb"] s.homepage = 'https://rubygems.org/gems/example' s.metadata = { "source_code_uri" => "https://github.com/example/example" } end""" result = self.gemspec_mapping.translate(raw_content) self.assertCountEqual(result.pop('description'), [ "This is an example!", "Much longer explanation of the example!" ]) self.assertEqual(result, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'type': 'SoftwareSourceCode', 'author': ['Ruby Coder'], 'name': 'example', 'license': 'https://spdx.org/licenses/MIT', 'codeRepository': 'https://rubygems.org/gems/example', 'email': 'rubycoder@example.com', 'version': '0.1.0', }) def test_gemspec_two_author_fields(self): raw_content = b""" Gem::Specification.new do |s| s.authors = ["Ruby Coder1"] s.author = "Ruby Coder2" end""" result = self.gemspec_mapping.translate(raw_content) self.assertCountEqual(result.pop('author'), [ 'Ruby Coder1', 'Ruby Coder2']) self.assertEqual(result, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'type': 'SoftwareSourceCode', }) def test_gemspec_invalid_author(self): raw_content = b""" Gem::Specification.new do |s| s.author = ["Ruby Coder"] end""" result = self.gemspec_mapping.translate(raw_content) self.assertEqual(result, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'type': 'SoftwareSourceCode', }) raw_content = b""" Gem::Specification.new do |s| s.author = "Ruby Coder1", end""" result = self.gemspec_mapping.translate(raw_content) self.assertEqual(result, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'type': 'SoftwareSourceCode', }) raw_content = b""" Gem::Specification.new do |s| s.authors = ["Ruby Coder1", ["Ruby Coder2"]] end""" result = self.gemspec_mapping.translate(raw_content) self.assertEqual(result, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'type': 'SoftwareSourceCode', 'author': ['Ruby Coder1'], }) def test_gemspec_alternative_header(self): 
raw_content = b""" require './lib/version' Gem::Specification.new { |s| s.name = 'rb-system-with-aliases' s.summary = 'execute system commands with aliases' } """ result = self.gemspec_mapping.translate(raw_content) self.assertEqual(result, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'type': 'SoftwareSourceCode', 'name': 'rb-system-with-aliases', 'description': 'execute system commands with aliases', }) @settings(suppress_health_check=[HealthCheck.too_slow]) @given(json_document_strategy( keys=list(MAPPINGS['NpmMapping'].mapping))) def test_npm_adversarial(self, doc): raw = json.dumps(doc).encode() self.npm_mapping.translate(raw) @settings(suppress_health_check=[HealthCheck.too_slow]) @given(json_document_strategy(keys=CODEMETA_TERMS)) def test_codemeta_adversarial(self, doc): raw = json.dumps(doc).encode() self.codemeta_mapping.translate(raw) @settings(suppress_health_check=[HealthCheck.too_slow]) @given(json_document_strategy( keys=list(MAPPINGS['MavenMapping'].mapping))) def test_maven_adversarial(self, doc): raw = xmltodict.unparse({'project': doc}, pretty=True) self.maven_mapping.translate(raw) @settings(suppress_health_check=[HealthCheck.too_slow]) @given(strategies.dictionaries( # keys strategies.one_of( strategies.text(), *map(strategies.just, MAPPINGS['GemspecMapping'].mapping) ), # values strategies.recursive( strategies.characters(), lambda children: strategies.lists(children, 1) ) )) def test_gemspec_adversarial(self, doc): parts = [b'Gem::Specification.new do |s|\n'] for (k, v) in doc.items(): parts.append(' s.{} = {}\n'.format(k, repr(v)).encode()) parts.append(b'end\n') self.gemspec_mapping.translate(b''.join(parts)) def test_revision_metadata_indexer(self): metadata_indexer = RevisionMetadataIndexer( config=REVISION_METADATA_CONFIG) fill_obj_storage(metadata_indexer.objstorage) fill_storage(metadata_indexer.storage) tool = metadata_indexer.idx_storage.indexer_configuration_get( {'tool_'+k: v for (k, v) in TRANSLATOR_TOOL.items()}) 
assert tool is not None metadata_indexer.idx_storage.content_metadata_add([{ 'indexer_configuration_id': tool['id'], 'id': b'cde', 'metadata': YARN_PARSER_METADATA, }]) sha1_gits = [ hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f'), ] metadata_indexer.run(sha1_gits, 'update-dups') results = list( metadata_indexer.idx_storage. revision_intrinsic_metadata_get(sha1_gits)) expected_results = [{ 'id': hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f'), 'tool': TRANSLATOR_TOOL, 'metadata': YARN_PARSER_METADATA, 'mappings': ['npm'], }] for result in results: del result['tool']['id'] # then self.assertEqual(expected_results, results) def test_revision_metadata_indexer_single_root_dir(self): metadata_indexer = RevisionMetadataIndexer( config=REVISION_METADATA_CONFIG) fill_obj_storage(metadata_indexer.objstorage) fill_storage(metadata_indexer.storage) # Add a parent directory, that is the only directory at the root # of the revision rev_id = hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f') subdir_id = metadata_indexer.storage._revisions[rev_id]['directory'] metadata_indexer.storage._revisions[rev_id]['directory'] = b'123456' metadata_indexer.storage.directory_add([{ 'id': b'123456', 'entries': [{ 'target': subdir_id, 'type': 'dir', 'length': None, 'name': b'foobar-1.0.0', 'sha1': None, 'perms': 16384, 'sha1_git': None, 'status': None, 'sha256': None }], }]) tool = metadata_indexer.idx_storage.indexer_configuration_get( {'tool_'+k: v for (k, v) in TRANSLATOR_TOOL.items()}) assert tool is not None metadata_indexer.idx_storage.content_metadata_add([{ 'indexer_configuration_id': tool['id'], 'id': b'cde', 'metadata': YARN_PARSER_METADATA, }]) sha1_gits = [ hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f'), ] metadata_indexer.run(sha1_gits, 'update-dups') results = list( metadata_indexer.idx_storage. 
revision_intrinsic_metadata_get(sha1_gits)) expected_results = [{ 'id': hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f'), 'tool': TRANSLATOR_TOOL, 'metadata': YARN_PARSER_METADATA, 'mappings': ['npm'], }] for result in results: del result['tool']['id'] # then self.assertEqual(expected_results, results) diff --git a/swh/indexer/tests/test_origin_metadata.py b/swh/indexer/tests/test_origin_metadata.py index 3971fdb..a5be367 100644 --- a/swh/indexer/tests/test_origin_metadata.py +++ b/swh/indexer/tests/test_origin_metadata.py @@ -1,217 +1,217 @@ # Copyright (C) 2018-2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from unittest.mock import patch from swh.model.hashutil import hash_to_bytes from swh.indexer.metadata import OriginMetadataIndexer from .utils import YARN_PARSER_METADATA from .test_metadata import REVISION_METADATA_CONFIG def test_origin_metadata_indexer( idx_storage, storage, obj_storage): indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG) indexer.run(["git+https://github.com/librariesio/yarn-parser"]) origin = storage.origin_get({ 'type': 'git', 'url': 'https://github.com/librariesio/yarn-parser'}) rev_id = hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f') rev_metadata = { 'id': rev_id, 'metadata': YARN_PARSER_METADATA, 'mappings': ['npm'], } origin_metadata = { 'id': origin['id'], 'from_revision': rev_id, 'metadata': YARN_PARSER_METADATA, 'mappings': ['npm'], } results = list( indexer.idx_storage.revision_intrinsic_metadata_get([rev_id])) for result in results: del result['tool'] assert results == [rev_metadata] results = list(indexer.idx_storage.origin_intrinsic_metadata_get([ origin['id']])) for result in results: del result['tool'] assert results == [origin_metadata] def test_origin_metadata_indexer_duplicate_origin( idx_storage, storage, 
obj_storage): indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG) indexer.storage = storage indexer.idx_storage = idx_storage indexer.run(["git+https://github.com/librariesio/yarn-parser"]) indexer.run(["git+https://github.com/librariesio/yarn-parser"]*2) origin = storage.origin_get({ 'type': 'git', 'url': 'https://github.com/librariesio/yarn-parser'}) rev_id = hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f') results = list( indexer.idx_storage.revision_intrinsic_metadata_get([rev_id])) assert len(results) == 1 results = list(indexer.idx_storage.origin_intrinsic_metadata_get([ origin['id']])) assert len(results) == 1 def test_origin_metadata_indexer_missing_head( idx_storage, storage, obj_storage): storage.origin_add([{ 'type': 'git', 'url': 'https://example.com' }]) indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG) indexer.run(["git+https://example.com"]) origin = storage.origin_get({ 'type': 'git', 'url': 'https://example.com'}) results = list(indexer.idx_storage.origin_intrinsic_metadata_get([ origin['id']])) assert results == [] def test_origin_metadata_indexer_partial_missing_head( idx_storage, storage, obj_storage): storage.origin_add([{ 'type': 'git', 'url': 'https://example.com' }]) indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG) indexer.run(["git+https://example.com", "git+https://github.com/librariesio/yarn-parser"]) origin1 = storage.origin_get({ 'type': 'git', 'url': 'https://example.com'}) origin2 = storage.origin_get({ 'type': 'git', 'url': 'https://github.com/librariesio/yarn-parser'}) rev_id = hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f') rev_metadata = { 'id': rev_id, 'metadata': YARN_PARSER_METADATA, 'mappings': ['npm'], } origin_metadata = { 'id': origin2['id'], 'from_revision': rev_id, 'metadata': YARN_PARSER_METADATA, 'mappings': ['npm'], } results = list( indexer.idx_storage.revision_intrinsic_metadata_get([rev_id])) for result in results: del result['tool'] assert results == 
[rev_metadata] results = list(indexer.idx_storage.origin_intrinsic_metadata_get([ origin1['id'], origin2['id']])) for result in results: del result['tool'] assert results == [origin_metadata] def test_origin_metadata_indexer_duplicate_revision( idx_storage, storage, obj_storage): indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG) indexer.storage = storage indexer.idx_storage = idx_storage indexer.run(["git+https://github.com/librariesio/yarn-parser", "git+https://github.com/librariesio/yarn-parser.git"]) origin1 = storage.origin_get({ 'type': 'git', 'url': 'https://github.com/librariesio/yarn-parser'}) origin2 = storage.origin_get({ 'type': 'git', 'url': 'https://github.com/librariesio/yarn-parser.git'}) assert origin1['id'] != origin2['id'] rev_id = hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f') results = list( indexer.idx_storage.revision_intrinsic_metadata_get([rev_id])) assert len(results) == 1 results = list(indexer.idx_storage.origin_intrinsic_metadata_get([ origin1['id'], origin2['id']])) assert len(results) == 2 def test_origin_metadata_indexer_no_metadata( idx_storage, storage, obj_storage): indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG) - with patch('swh.indexer.metadata_dictionary.NpmMapping.filename', + with patch('swh.indexer.metadata_dictionary.npm.NpmMapping.filename', b'foo.json'): indexer.run(["git+https://github.com/librariesio/yarn-parser"]) origin = storage.origin_get({ 'type': 'git', 'url': 'https://github.com/librariesio/yarn-parser'}) rev_id = hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f') results = list( indexer.idx_storage.revision_intrinsic_metadata_get([rev_id])) assert results == [] results = list(indexer.idx_storage.origin_intrinsic_metadata_get([ origin['id']])) assert results == [] def test_origin_metadata_indexer_delete_metadata( idx_storage, storage, obj_storage): indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG) 
indexer.run(["git+https://github.com/librariesio/yarn-parser"]) origin = storage.origin_get({ 'type': 'git', 'url': 'https://github.com/librariesio/yarn-parser'}) rev_id = hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f') results = list( indexer.idx_storage.revision_intrinsic_metadata_get([rev_id])) assert results != [] results = list(indexer.idx_storage.origin_intrinsic_metadata_get([ origin['id']])) assert results != [] - with patch('swh.indexer.metadata_dictionary.NpmMapping.filename', + with patch('swh.indexer.metadata_dictionary.npm.NpmMapping.filename', b'foo.json'): indexer.run(["git+https://github.com/librariesio/yarn-parser"]) results = list( indexer.idx_storage.revision_intrinsic_metadata_get([rev_id])) assert results == [] results = list(indexer.idx_storage.origin_intrinsic_metadata_get([ origin['id']])) assert results == []