diff --git a/swh/indexer/metadata_dictionary.py b/swh/indexer/metadata_dictionary.py index f4a6edc..300fa46 100644 --- a/swh/indexer/metadata_dictionary.py +++ b/swh/indexer/metadata_dictionary.py @@ -1,352 +1,405 @@ # Copyright (C) 2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import os import re import abc import json import logging +import email.parser + import xmltodict from swh.indexer.codemeta import CROSSWALK_TABLE, SCHEMA_URI from swh.indexer.codemeta import compact, expand MAPPINGS = {} def register_mapping(cls): MAPPINGS[cls.__name__] = cls() return cls class BaseMapping(metaclass=abc.ABCMeta): """Base class for mappings to inherit from To implement a new mapping: - inherit this class - override translate function """ def __init__(self): self.log = logging.getLogger('%s.%s' % ( self.__class__.__module__, self.__class__.__name__)) @abc.abstractmethod def detect_metadata_files(self, files): """ Detects files potentially containing metadata Args: file_entries (list): list of files Returns: list: list of sha1 (possibly empty) """ pass @abc.abstractmethod def translate(self, file_content): pass def normalize_translation(self, metadata): return compact(metadata) class SingleFileMapping(BaseMapping): """Base class for all mappings that use a single file as input.""" @property @abc.abstractmethod def filename(self): """The .json file to extract metadata from.""" pass def detect_metadata_files(self, file_entries): for entry in file_entries: if entry['name'] == self.filename: return [entry['sha1']] return [] class DictMapping(BaseMapping): """Base class for mappings that take as input a file that is mostly a key-value store (eg. a shallow JSON dict).""" @property @abc.abstractmethod def mapping(self): """A translation dict to map dict keys into a canonical name.""" pass def translate_dict(self, content_dict, *, normalize=True): """ Translates content by parsing content from a dict object and translating with the appropriate mapping Args: content_dict (dict): content dict to translate Returns: dict: translated metadata in json-friendly form needed for the indexer """ translated_metadata = {'@type': SCHEMA_URI + 'SoftwareSourceCode'} for k, v in content_dict.items(): # First, check if there is a specific translation # method for this key - translation_method = getattr(self, 'translate_' + k, None) + translation_method = getattr( + self, 'translate_' + k.replace('-', '_'), None) if translation_method: translation_method(translated_metadata, v) elif k in self.mapping: # if there is no method, but the key is known from the # crosswalk table # if there is a normalization method, use it on the value - normalization_method = getattr(self, 'normalize_' + k, None) + normalization_method = getattr( + self, 'normalize_' + k.replace('-', '_'), None) if normalization_method: v = normalization_method(v) # set the translation metadata with the normalized value translated_metadata[self.mapping[k]] = v if normalize: return self.normalize_translation(translated_metadata) else: return translated_metadata class JsonMapping(DictMapping, SingleFileMapping): """Base class for all mappings that use a JSON file as input.""" def translate(self, raw_content): """ Translates content by parsing content from a bytestring containing json data and translating with the appropriate mapping Args: raw_content (bytes): raw content to translate Returns: dict: translated metadata in json-friendly form needed for the indexer """ try: raw_content = raw_content.decode() except UnicodeDecodeError: self.log.warning('Error unidecoding %r', raw_content) return try: content_dict = json.loads(raw_content) except json.JSONDecodeError: self.log.warning('Error unjsoning %r' % raw_content) return return self.translate_dict(content_dict) @register_mapping class NpmMapping(JsonMapping): """ dedicated class for NPM (package.json) mapping and translation """ mapping = CROSSWALK_TABLE['NodeJS'] filename = b'package.json' _schema_shortcuts = { 'github': 'https://github.com/', 'gist': 'https://gist.github.com/', 'bitbucket': 'https://bitbucket.org/', 'gitlab': 'https://gitlab.com/', } def normalize_repository(self, d): """https://docs.npmjs.com/files/package.json#repository""" if isinstance(d, dict): url = '{type}+{url}'.format(**d) elif isinstance(d, str): if '://' in d: url = d elif ':' in d: (schema, rest) = d.split(':', 1) if schema in self._schema_shortcuts: url = self._schema_shortcuts[schema] + rest else: return None else: url = self._schema_shortcuts['github'] + d else: return None return {'@id': url} def normalize_bugs(self, d): return {'@id': '{url}'.format(**d)} _parse_author = re.compile(r'^ *' r'(?P.*?)' r'( +<(?P.*)>)?' r'( +\((?P.*)\))?' r' *$') def normalize_author(self, d): 'https://docs.npmjs.com/files/package.json' \ '#people-fields-author-contributors' author = {'@type': SCHEMA_URI+'Person'} if isinstance(d, dict): name = d.get('name', None) email = d.get('email', None) url = d.get('url', None) elif isinstance(d, str): match = self._parse_author.match(d) name = match.group('name') email = match.group('email') url = match.group('url') else: return None if name: author[SCHEMA_URI+'name'] = name if email: author[SCHEMA_URI+'email'] = email if url: author[SCHEMA_URI+'url'] = {'@id': url} return {"@list": [author]} def normalize_license(self, s): return {"@id": "https://spdx.org/licenses/" + s} def normalize_homepage(self, s): return {"@id": s} @register_mapping class CodemetaMapping(SingleFileMapping): """ dedicated class for CodeMeta (codemeta.json) mapping and translation """ filename = b'codemeta.json' def translate(self, content): return self.normalize_translation(expand(json.loads(content.decode()))) @register_mapping class MavenMapping(DictMapping, SingleFileMapping): """ dedicated class for Maven (pom.xml) mapping and translation """ filename = b'pom.xml' mapping = CROSSWALK_TABLE['Java (Maven)'] def translate(self, content): d = xmltodict.parse(content)['project'] metadata = self.translate_dict(d, normalize=False) metadata[SCHEMA_URI+'codeRepository'] = self.parse_repositories(d) metadata[SCHEMA_URI+'license'] = self.parse_licenses(d) return self.normalize_translation(metadata) _default_repository = {'url': 'https://repo.maven.apache.org/maven2/'} def parse_repositories(self, d): """https://maven.apache.org/pom.html#Repositories""" if 'repositories' not in d: return [self.parse_repository(d, self._default_repository)] else: repositories = d['repositories'].get('repository', []) if not isinstance(repositories, list): repositories = [repositories] results = [] for repo in repositories: res = self.parse_repository(d, repo) if res: results.append(res) return results def parse_repository(self, d, repo): if repo.get('layout', 'default') != 'default': return # TODO ? url = repo['url'] if d['groupId']: url = os.path.join(url, *d['groupId'].split('.')) if d['artifactId']: url = os.path.join(url, d['artifactId']) return {"@id": url} def normalize_groupId(self, id_): return {"@id": id_} def parse_licenses(self, d): """https://maven.apache.org/pom.html#Licenses The origin XML has the form: Apache License, Version 2.0 https://www.apache.org/licenses/LICENSE-2.0.txt Which was translated to a dict by xmltodict and is given as `d`: >>> d = { ... # ... ... "licenses": { ... "license": { ... "name": "Apache License, Version 2.0", ... "url": ... "https://www.apache.org/licenses/LICENSE-2.0.txt" ... } ... } ... } >>> MavenMapping().parse_licenses(d) [{'@id': 'https://www.apache.org/licenses/LICENSE-2.0.txt'}] or, if there are more than one license: >>> from pprint import pprint >>> d = { ... # ... ... "licenses": { ... "license": [ ... { ... "name": "Apache License, Version 2.0", ... "url": ... "https://www.apache.org/licenses/LICENSE-2.0.txt" ... }, ... { ... "name": "MIT License, ", ... "url": "https://opensource.org/licenses/MIT" ... } ... ] ... } ... } >>> pprint(MavenMapping().parse_licenses(d)) [{'@id': 'https://www.apache.org/licenses/LICENSE-2.0.txt'}, {'@id': 'https://opensource.org/licenses/MIT'}] """ licenses = d.get('licenses', {}).get('license', []) if isinstance(licenses, dict): licenses = [licenses] return [{"@id": license['url']} for license in licenses] +_normalize_pkginfo_key = str.lower + + +@register_mapping +class PythonPkginfoMapping(DictMapping, SingleFileMapping): + """Dedicated class for Python's PKG-INFO mapping and translation. + + https://www.python.org/dev/peps/pep-0314/""" + filename = b'PKG-INFO' + mapping = {_normalize_pkginfo_key(k): v + for (k, v) in CROSSWALK_TABLE['Python PKG-INFO'].items()} + + _parser = email.parser.BytesHeaderParser() + + def translate(self, content): + msg = self._parser.parsebytes(content) + d = {} + for (key, value) in msg.items(): + key = _normalize_pkginfo_key(key) + if value != 'UNKNOWN': + d.setdefault(key, []).append(value) + metadata = self.translate_dict(d, normalize=False) + if SCHEMA_URI+'author' in metadata or SCHEMA_URI+'email' in metadata: + metadata[SCHEMA_URI+'author'] = { + '@list': [{ + '@type': SCHEMA_URI+'Person', + SCHEMA_URI+'name': + metadata.pop(SCHEMA_URI+'author', [None])[0], + SCHEMA_URI+'email': + metadata.pop(SCHEMA_URI+'email', [None])[0], + }] + } + return self.normalize_translation(metadata) + + def translate_summary(self, translated_metadata, v): + k = self.mapping['summary'] + translated_metadata.setdefault(k, []).append(v) + + def translate_description(self, translated_metadata, v): + k = self.mapping['description'] + translated_metadata.setdefault(k, []).append(v) + + def normalize_home_page(self, urls): + return [{'@id': url} for url in urls] + + def normalize_license(self, licenses): + return [{'@id': license} for license in licenses] + + def main(): raw_content = """{"name": "test_name", "unknown_term": "ut"}""" raw_content1 = b"""{"name": "test_name", "unknown_term": "ut", "prerequisites" :"packageXYZ"}""" result = MAPPINGS["NpmMapping"].translate(raw_content) result1 = MAPPINGS["MavenMapping"].translate(raw_content1) print(result) print(result1) if __name__ == "__main__": main() diff --git a/swh/indexer/tests/test_metadata.py b/swh/indexer/tests/test_metadata.py index bcc23aa..85630d9 100644 --- a/swh/indexer/tests/test_metadata.py +++ b/swh/indexer/tests/test_metadata.py @@ -1,585 +1,657 @@ # Copyright (C) 2017-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import unittest from swh.model.hashutil import hash_to_bytes from swh.indexer.metadata_dictionary import CROSSWALK_TABLE, MAPPINGS from swh.indexer.metadata_detector import ( detect_metadata, extract_minimal_metadata_dict ) from swh.indexer.metadata import ( ContentMetadataIndexer, RevisionMetadataIndexer ) from .test_utils import ( BASE_TEST_CONFIG, fill_obj_storage, fill_storage ) TRANSLATOR_TOOL = { 'name': 'swh-metadata-translator', 'version': '0.0.2', 'configuration': { 'type': 'local', 'context': 'NpmMapping' } } class ContentMetadataTestIndexer(ContentMetadataIndexer): """Specific Metadata whose configuration is enough to satisfy the indexing tests. """ def parse_config_file(self, *args, **kwargs): assert False, 'should not be called; the rev indexer configures it.' class RevisionMetadataTestIndexer(RevisionMetadataIndexer): """Specific indexer whose configuration is enough to satisfy the indexing tests. """ ContentMetadataIndexer = ContentMetadataTestIndexer def parse_config_file(self, *args, **kwargs): return { **BASE_TEST_CONFIG, 'tools': TRANSLATOR_TOOL, } class Metadata(unittest.TestCase): """ Tests metadata_mock_tool tool for Metadata detection """ def setUp(self): """ shows the entire diff in the results """ self.maxDiff = None def test_crosstable(self): self.assertEqual(CROSSWALK_TABLE['NodeJS'], { 'repository': 'http://schema.org/codeRepository', 'os': 'http://schema.org/operatingSystem', 'cpu': 'http://schema.org/processorRequirements', 'engines': 'http://schema.org/processorRequirements', 'author': 'http://schema.org/author', 'author.email': 'http://schema.org/email', 'author.name': 'http://schema.org/name', 'contributor': 'http://schema.org/contributor', 'keywords': 'http://schema.org/keywords', 'license': 'http://schema.org/license', 'version': 'http://schema.org/version', 'description': 'http://schema.org/description', 'name': 'http://schema.org/name', 'bugs': 'https://codemeta.github.io/terms/issueTracker', 'homepage': 'http://schema.org/url' }) def test_compute_metadata_none(self): """ testing content empty content is empty should return None """ # given content = b"" # None if no metadata was found or an error occurred declared_metadata = None # when result = MAPPINGS["NpmMapping"].translate(content) # then self.assertEqual(declared_metadata, result) def test_compute_metadata_npm(self): """ testing only computation of metadata with hard_mapping_npm """ # given content = b""" { "name": "test_metadata", "version": "0.0.2", "description": "Simple package.json test for indexer", "repository": { "type": "git", "url": "https://github.com/moranegg/metadata_test" }, "author": { "email": "moranegg@example.com", "name": "Morane G" } } """ declared_metadata = { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'type': 'SoftwareSourceCode', 'name': 'test_metadata', 'version': '0.0.2', 'description': 'Simple package.json test for indexer', 'codeRepository': 'git+https://github.com/moranegg/metadata_test', 'author': [{ 'type': 'Person', 'name': 'Morane G', 'email': 'moranegg@example.com', }], } # when result = MAPPINGS["NpmMapping"].translate(content) # then self.assertEqual(declared_metadata, result) def test_extract_minimal_metadata_dict(self): """ Test the creation of a coherent minimal metadata set """ # given metadata_list = [{ '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'name': 'test_1', 'version': '0.0.2', 'description': 'Simple package.json test for indexer', 'codeRepository': 'git+https://github.com/moranegg/metadata_test', }, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'name': 'test_0_1', 'version': '0.0.2', 'description': 'Simple package.json test for indexer', 'codeRepository': 'git+https://github.com/moranegg/metadata_test' }, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'name': 'test_metadata', 'version': '0.0.2', 'author': 'moranegg', }] # when results = extract_minimal_metadata_dict(metadata_list) # then expected_results = { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', "version": '0.0.2', "description": 'Simple package.json test for indexer', "name": ['test_1', 'test_0_1', 'test_metadata'], "author": ['moranegg'], "codeRepository": 'git+https://github.com/moranegg/metadata_test', } self.assertEqual(expected_results, results) def test_index_content_metadata_npm(self): """ testing NPM with package.json - one sha1 uses a file that can't be translated to metadata and should return None in the translated metadata """ # given sha1s = [ hash_to_bytes('26a9f72a7c87cc9205725cfd879f514ff4f3d8d5'), hash_to_bytes('d4c647f0fc257591cc9ba1722484229780d1c607'), hash_to_bytes('02fb2c89e14f7fab46701478c83779c7beb7b069'), ] # this metadata indexer computes only metadata for package.json # in npm context with a hard mapping metadata_indexer = ContentMetadataTestIndexer( tool=TRANSLATOR_TOOL, config=BASE_TEST_CONFIG.copy()) fill_obj_storage(metadata_indexer.objstorage) fill_storage(metadata_indexer.storage) # when metadata_indexer.run(sha1s, policy_update='ignore-dups') results = list(metadata_indexer.idx_storage.content_metadata_get( sha1s)) expected_results = [{ 'translated_metadata': { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'type': 'SoftwareSourceCode', 'codeRepository': 'git+https://github.com/moranegg/metadata_test', 'description': 'Simple package.json test for indexer', 'name': 'test_metadata', 'version': '0.0.1' }, 'id': hash_to_bytes('26a9f72a7c87cc9205725cfd879f514ff4f3d8d5') }, { 'translated_metadata': { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'type': 'SoftwareSourceCode', 'issueTracker': 'https://github.com/npm/npm/issues', 'author': [{ 'type': 'Person', 'name': 'Isaac Z. Schlueter', 'email': 'i@izs.me', 'url': 'http://blog.izs.me', }], 'codeRepository': 'git+https://github.com/npm/npm', 'description': 'a package manager for JavaScript', 'license': 'https://spdx.org/licenses/Artistic-2.0', 'version': '5.0.3', 'name': 'npm', 'keywords': [ 'install', 'modules', 'package manager', 'package.json' ], 'url': 'https://docs.npmjs.com/' }, 'id': hash_to_bytes('d4c647f0fc257591cc9ba1722484229780d1c607') }, { 'translated_metadata': None, 'id': hash_to_bytes('02fb2c89e14f7fab46701478c83779c7beb7b069') }] for result in results: del result['tool'] # The assertion below returns False sometimes because of nested lists self.assertEqual(expected_results, results) def test_detect_metadata_package_json(self): # given df = [{ 'sha1_git': b'abc', 'name': b'index.js', 'target': b'abc', 'length': 897, 'status': 'visible', 'type': 'file', 'perms': 33188, 'dir_id': b'dir_a', 'sha1': b'bcd' }, { 'sha1_git': b'aab', 'name': b'package.json', 'target': b'aab', 'length': 712, 'status': 'visible', 'type': 'file', 'perms': 33188, 'dir_id': b'dir_a', 'sha1': b'cde' }] # when results = detect_metadata(df) expected_results = { 'NpmMapping': [ b'cde' ] } # then self.assertEqual(expected_results, results) def test_compute_metadata_valid_codemeta(self): raw_content = ( b"""{ "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "@type": "SoftwareSourceCode", "identifier": "CodeMeta", "description": "CodeMeta is a concept vocabulary that can be used to standardize the exchange of software metadata across repositories and organizations.", "name": "CodeMeta: Minimal metadata schemas for science software and code, in JSON-LD", "codeRepository": "https://github.com/codemeta/codemeta", "issueTracker": "https://github.com/codemeta/codemeta/issues", "license": "https://spdx.org/licenses/Apache-2.0", "version": "2.0", "author": [ { "@type": "Person", "givenName": "Carl", "familyName": "Boettiger", "email": "cboettig@gmail.com", "@id": "http://orcid.org/0000-0002-1642-628X" }, { "@type": "Person", "givenName": "Matthew B.", "familyName": "Jones", "email": "jones@nceas.ucsb.edu", "@id": "http://orcid.org/0000-0003-0077-4738" } ], "maintainer": { "@type": "Person", "givenName": "Carl", "familyName": "Boettiger", "email": "cboettig@gmail.com", "@id": "http://orcid.org/0000-0002-1642-628X" }, "contIntegration": "https://travis-ci.org/codemeta/codemeta", "developmentStatus": "active", "downloadUrl": "https://github.com/codemeta/codemeta/archive/2.0.zip", "funder": { "@id": "https://doi.org/10.13039/100000001", "@type": "Organization", "name": "National Science Foundation" }, "funding":"1549758; Codemeta: A Rosetta Stone for Metadata in Scientific Software", "keywords": [ "metadata", "software" ], "version":"2.0", "dateCreated":"2017-06-05", "datePublished":"2017-06-05", "programmingLanguage": "JSON-LD" }""") # noqa expected_result = { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "identifier": "CodeMeta", "description": "CodeMeta is a concept vocabulary that can " "be used to standardize the exchange of software metadata " "across repositories and organizations.", "name": "CodeMeta: Minimal metadata schemas for science " "software and code, in JSON-LD", "codeRepository": "https://github.com/codemeta/codemeta", "issueTracker": "https://github.com/codemeta/codemeta/issues", "license": "https://spdx.org/licenses/Apache-2.0", "version": "2.0", "author": [ { "type": "Person", "givenName": "Carl", "familyName": "Boettiger", "email": "cboettig@gmail.com", "id": "http://orcid.org/0000-0002-1642-628X" }, { "type": "Person", "givenName": "Matthew B.", "familyName": "Jones", "email": "jones@nceas.ucsb.edu", "id": "http://orcid.org/0000-0003-0077-4738" } ], "maintainer": { "type": "Person", "givenName": "Carl", "familyName": "Boettiger", "email": "cboettig@gmail.com", "id": "http://orcid.org/0000-0002-1642-628X" }, "contIntegration": "https://travis-ci.org/codemeta/codemeta", "developmentStatus": "active", "downloadUrl": "https://github.com/codemeta/codemeta/archive/2.0.zip", "funder": { "id": "https://doi.org/10.13039/100000001", "type": "Organization", "name": "National Science Foundation" }, "funding": "1549758; Codemeta: A Rosetta Stone for Metadata " "in Scientific Software", "keywords": [ "metadata", "software" ], "version": "2.0", "dateCreated": "2017-06-05", "datePublished": "2017-06-05", "programmingLanguage": "JSON-LD" } result = MAPPINGS["CodemetaMapping"].translate(raw_content) self.assertEqual(result, expected_result) def test_compute_metadata_maven(self): raw_content = b""" Maven Default Project 4.0.0 com.mycompany.app my-app 1.2.3 central Maven Repository Switchboard default http://repo1.maven.org/maven2 false Apache License, Version 2.0 https://www.apache.org/licenses/LICENSE-2.0.txt repo A business-friendly OSS license """ result = MAPPINGS["MavenMapping"].translate(raw_content) self.assertEqual(result, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'type': 'SoftwareSourceCode', 'name': 'Maven Default Project', 'identifier': 'com.mycompany.app', 'version': '1.2.3', 'license': 'https://www.apache.org/licenses/LICENSE-2.0.txt', 'codeRepository': 'http://repo1.maven.org/maven2/com/mycompany/app/my-app', }) def test_compute_metadata_maven_minimal(self): raw_content = b""" Maven Default Project 4.0.0 com.mycompany.app my-app 1.2.3 """ result = MAPPINGS["MavenMapping"].translate(raw_content) self.assertEqual(result, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'type': 'SoftwareSourceCode', 'name': 'Maven Default Project', 'identifier': 'com.mycompany.app', 'version': '1.2.3', 'codeRepository': 'https://repo.maven.apache.org/maven2/com/mycompany/app/my-app', 'license': [], }) def test_compute_metadata_maven_multiple(self): '''Tests when there are multiple code repos and licenses.''' raw_content = b""" Maven Default Project 4.0.0 com.mycompany.app my-app 1.2.3 central Maven Repository Switchboard default http://repo1.maven.org/maven2 false example Example Maven Repo default http://example.org/maven2 Apache License, Version 2.0 https://www.apache.org/licenses/LICENSE-2.0.txt repo A business-friendly OSS license MIT license https://opensource.org/licenses/MIT """ result = MAPPINGS["MavenMapping"].translate(raw_content) self.assertEqual(result, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'type': 'SoftwareSourceCode', 'name': 'Maven Default Project', 'identifier': 'com.mycompany.app', 'version': '1.2.3', 'license': [ 'https://www.apache.org/licenses/LICENSE-2.0.txt', 'https://opensource.org/licenses/MIT', ], 'codeRepository': [ 'http://repo1.maven.org/maven2/com/mycompany/app/my-app', 'http://example.org/maven2/com/mycompany/app/my-app', ] }) + def test_compute_metadata_pkginfo(self): + raw_content = (b"""\ +Metadata-Version: 2.1 +Name: swh.core +Version: 0.0.49 +Summary: Software Heritage core utilities +Home-page: https://forge.softwareheritage.org/diffusion/DCORE/ +Author: Software Heritage developers +Author-email: swh-devel@inria.fr +License: UNKNOWN +Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest +Project-URL: Funding, https://www.softwareheritage.org/donate +Project-URL: Source, https://forge.softwareheritage.org/source/swh-core +Description: swh-core + ======== + + core library for swh's modules: + - config parser + - hash computations + - serialization + - logging mechanism + +Platform: UNKNOWN +Classifier: Programming Language :: Python :: 3 +Classifier: Intended Audience :: Developers +Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) +Classifier: Operating System :: OS Independent +Classifier: Development Status :: 5 - Production/Stable +Description-Content-Type: text/markdown +Provides-Extra: testing +""") # noqa + result = MAPPINGS["PythonPkginfoMapping"].translate(raw_content) + self.assertCountEqual(result['description'], [ + 'Software Heritage core utilities', # note the comma here + 'swh-core\n' + ' ========\n' + ' \n' + " core library for swh's modules:\n" + ' - config parser\n' + ' - hash computations\n' + ' - serialization\n' + ' - logging mechanism\n' + ' '], + result) + del result['description'] + self.assertEqual(result, { + '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', + 'type': 'SoftwareSourceCode', + 'url': 'https://forge.softwareheritage.org/diffusion/DCORE/', + 'name': 'swh.core', + 'author': [{ + 'type': 'Person', + 'name': 'Software Heritage developers', + 'email': 'swh-devel@inria.fr', + }], + 'version': '0.0.49', + }) + + def test_compute_metadata_pkginfo_license(self): + raw_content = (b"""\ +Metadata-Version: 2.1 +Name: foo +License: MIT +""") # noqa + result = MAPPINGS["PythonPkginfoMapping"].translate(raw_content) + self.assertEqual(result, { + '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', + 'type': 'SoftwareSourceCode', + 'name': 'foo', + 'license': 'MIT', + }) + def test_revision_metadata_indexer(self): metadata_indexer = RevisionMetadataTestIndexer() fill_obj_storage(metadata_indexer.objstorage) fill_storage(metadata_indexer.storage) tool = metadata_indexer.idx_storage.indexer_configuration_get( {'tool_'+k: v for (k, v) in TRANSLATOR_TOOL.items()}) assert tool is not None metadata_indexer.idx_storage.content_metadata_add([{ 'indexer_configuration_id': tool['id'], 'id': b'cde', 'translated_metadata': { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'type': 'SoftwareSourceCode', 'issueTracker': 'https://github.com/librariesio/yarn-parser/issues', 'version': '1.0.0', 'name': 'yarn-parser', 'author': ['Andrew Nesbitt'], 'url': 'https://github.com/librariesio/yarn-parser#readme', 'processorRequirements': {'node': '7.5'}, 'license': 'AGPL-3.0', 'keywords': ['yarn', 'parse', 'lock', 'dependencies'], 'codeRepository': 'git+https://github.com/librariesio/yarn-parser.git', 'description': 'Tiny web service for parsing yarn.lock files', } }]) sha1_gits = [ hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f'), ] metadata_indexer.run(sha1_gits, 'update-dups') results = list(metadata_indexer.idx_storage.revision_metadata_get( sha1_gits)) expected_results = [{ 'id': hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f'), 'tool': TRANSLATOR_TOOL, 'translated_metadata': { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'url': 'https://github.com/librariesio/yarn-parser#readme', 'codeRepository': 'git+https://github.com/librariesio/yarn-parser.git', 'author': ['Andrew Nesbitt'], 'license': 'AGPL-3.0', 'version': '1.0.0', 'description': 'Tiny web service for parsing yarn.lock files', 'issueTracker': 'https://github.com/librariesio/yarn-parser/issues', 'name': 'yarn-parser', 'keywords': ['yarn', 'parse', 'lock', 'dependencies'], }, }] for result in results: del result['tool']['id'] # then self.assertEqual(expected_results, results)