diff --git a/requirements.txt b/requirements.txt
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,3 +4,4 @@
 chardet
 file_magic
 pyld
+xmltodict
diff --git a/swh/indexer/codemeta.py b/swh/indexer/codemeta.py
--- a/swh/indexer/codemeta.py
+++ b/swh/indexer/codemeta.py
@@ -27,14 +27,50 @@
 
 PROPERTY_BLACKLIST = {
     # CodeMeta properties that we cannot properly represent.
-    CODEMETA_URI + 'softwareRequirements',
+    SCHEMA_URI + 'softwareRequirements',
     CODEMETA_URI + 'softwareSuggestions',
 
     # Duplicate of 'author'
-    CODEMETA_URI + 'creator',
+    SCHEMA_URI + 'creator',
     }
 
 
+def make_absolute_uri(local_name):
+    """Returns the absolute URI of a term defined in the CodeMeta context.
+
+    Args:
+        local_name (str): key of the term in CODEMETA_CONTEXT['@context']
+
+    Returns:
+        str: the definition itself if it is a string; else its '@id',
+        with the 'schema:' or 'codemeta:' prefix replaced with SCHEMA_URI
+        or CODEMETA_URI
+
+    Raises:
+        KeyError: if the context does not define the term, or if its
+            definition is a dict without '@id'
+        ValueError: if the definition is neither a string nor a dict, or
+            if its '@id' has another prefix
+
+    """
+    definition = CODEMETA_CONTEXT['@context'][local_name]
+    if isinstance(definition, str):
+        return definition
+    if not isinstance(definition, dict):
+        # Raise instead of asserting: asserts are stripped by `python -O`,
+        # and this function would then return None.
+        raise ValueError('Unsupported definition of %r: %r'
+                         % (local_name, definition))
+    prefixed_name = definition['@id']
+    # Only split on the first colon: anything after it is the term.
+    (prefix, _, term) = prefixed_name.partition(':')
+    if prefix == 'schema':
+        return SCHEMA_URI + term
+    elif prefix == 'codemeta':
+        return CODEMETA_URI + term
+    else:
+        raise ValueError('Unknown prefix in %r' % prefixed_name)
+
+
 def _read_crosstable(fd):
     reader = csv.reader(fd)
     try:
@@ -49,7 +85,10 @@
     codemeta_translation = {data_source: {} for data_source in data_sources}
 
     for line in reader:  # For each canonical name
-        canonical_name = CODEMETA_URI + dict(zip(header, line))['Property']
+        local_name = dict(zip(header, line))['Property']
+        if not local_name:
+            continue
+        canonical_name = make_absolute_uri(local_name)
         if canonical_name in PROPERTY_BLACKLIST:
             continue
         for (col, value) in zip(header, line):  # For each cell in the row
diff --git a/swh/indexer/metadata_detector.py b/swh/indexer/metadata_detector.py
--- a/swh/indexer/metadata_detector.py
+++ b/swh/indexer/metadata_detector.py
@@ -3,7 +3,8 @@
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
-from swh.indexer.codemeta import compact, expand, CODEMETA_URI
+from swh.indexer.codemeta import compact, expand
+from
swh.indexer.codemeta import make_absolute_uri
 from swh.indexer.metadata_dictionary import MAPPINGS
 
 
@@ -31,7 +32,8 @@
     "url", "license", "maintainer", "email", "identifier",
     "codeRepository"}
 
-MINIMAL_METADATA_SET = {CODEMETA_URI+prop for prop in _MINIMAL_PROPERTY_SET}
+MINIMAL_METADATA_SET = {make_absolute_uri(prop)
+                        for prop in _MINIMAL_PROPERTY_SET}
 
 
 def extract_minimal_metadata_dict(metadata_list):
diff --git a/swh/indexer/metadata_dictionary.py b/swh/indexer/metadata_dictionary.py
--- a/swh/indexer/metadata_dictionary.py
+++ b/swh/indexer/metadata_dictionary.py
@@ -3,12 +3,14 @@
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 import re
 import abc
 import json
 import logging
+import xmltodict
 
-from swh.indexer.codemeta import CROSSWALK_TABLE, CODEMETA_URI, compact
+from swh.indexer.codemeta import CROSSWALK_TABLE, SCHEMA_URI
+from swh.indexer.codemeta import compact, expand
 
 
 MAPPINGS = {}
@@ -53,6 +55,22 @@
         return compact(metadata)
 
 
+class SingleFileMapping(BaseMapping):
+    """Base class for all mappings that use a single file as input."""
+
+    @property
+    @abc.abstractmethod
+    def filename(self):
+        """The name of the file to extract metadata from."""
+        pass
+
+    def detect_metadata_files(self, file_entries):
+        for entry in file_entries:
+            if entry['name'] == self.filename:
+                return [entry['sha1']]
+        return []
+
+
 class DictMapping(BaseMapping):
     """Base class for mappings that take as input a file that is mostly
     a key-value store (eg. a shallow JSON dict)."""
@@ -63,7 +81,7 @@
         """A translation dict to map dict keys into a canonical name."""
         pass
 
-    def translate_dict(self, content_dict):
+    def translate_dict(self, content_dict, *, normalize=True):
         """
         Translates content by parsing content from a dict object
         and translating with the appropriate mapping
@@ -76,7 +94,7 @@
             the indexer
 
         """
-        translated_metadata = {}
+        translated_metadata = {'@type': SCHEMA_URI + 'SoftwareSourceCode'}
         for k, v in content_dict.items():
             # First, check if there is a specific translation
             # method for this key
@@ -94,24 +112,15 @@
 
                 # set the translation metadata with the normalized value
                 translated_metadata[self.mapping[k]] = v
-        return self.normalize_translation(translated_metadata)
+        if normalize:
+            return self.normalize_translation(translated_metadata)
+        else:
+            return translated_metadata
 
 
-class JsonMapping(DictMapping):
+class JsonMapping(DictMapping, SingleFileMapping):
     """Base class for all mappings that use a JSON file as input."""
 
-    @property
-    @abc.abstractmethod
-    def filename(self):
-        """The .json file to extract metadata from."""
-        pass
-
-    def detect_metadata_files(self, file_entries):
-        for entry in file_entries:
-            if entry['name'] == self.filename:
-                return [entry['sha1']]
-        return []
-
     def translate(self, raw_content):
         """
         Translates content by parsing content from a bytestring containing
@@ -184,7 +193,7 @@
     def normalize_author(self, d):
         'https://docs.npmjs.com/files/package.json' \
             '#people-fields-author-contributors'
-        author = {'@type': CODEMETA_URI+'Person'}
+        author = {'@type': SCHEMA_URI+'Person'}
         if isinstance(d, dict):
             name = d.get('name', None)
             email = d.get('email', None)
@@ -197,22 +206,121 @@
         else:
             return None
         if name:
-            author[CODEMETA_URI+'name'] = name
+            author[SCHEMA_URI+'name'] = name
         if email:
-            author[CODEMETA_URI+'email'] = email
+            author[SCHEMA_URI+'email'] = email
         if url:
-            author[CODEMETA_URI+'url'] = url
+            author[SCHEMA_URI+'url'] = url
         return author
 
 
 @register_mapping
-class CodemetaMapping(JsonMapping):
+class CodemetaMapping(SingleFileMapping):
     """
     dedicated class for CodeMeta (codemeta.json) mapping and translation
     """
-    mapping = CROSSWALK_TABLE['codemeta-V1']
     filename = b'codemeta.json'
 
+    def translate(self, content):
+        """Decodes content as JSON, then expand()s and normalizes it."""
+        return self.normalize_translation(expand(json.loads(content.decode())))
+
+
+@register_mapping
+class MavenMapping(DictMapping, SingleFileMapping):
+    """
+    dedicated class for Maven (pom.xml) mapping and translation
+    """
+    filename = b'pom.xml'
+    mapping = CROSSWALK_TABLE['Java (Maven)']
+
+    def translate(self, content):
+        """Translates the content of a pom.xml file.
+
+        Args:
+            content: document passed to xmltodict.parse(); its root
+                element must be <project>
+
+        Returns:
+            the normalized translation of the keys of <project>, with
+            codeRepository set to the output of parse_repositories()
+
+        """
+        d = xmltodict.parse(content)['project']
+        metadata = self.translate_dict(d, normalize=False)
+        metadata[SCHEMA_URI+'codeRepository'] = self.parse_repositories(d)
+        return self.normalize_translation(metadata)
+
+    _default_repository = {'url': 'https://repo.maven.apache.org/maven2/'}
+
+    def parse_repositories(self, d):
+        """Returns the URLs of the project in the repositories it declares.
+
+        See https://maven.apache.org/pom.html#Repositories
+
+        Args:
+            d (dict): the <project> element, as parsed by xmltodict
+
+        Returns:
+            list: the non-empty outputs of parse_repository() for each
+            <repository>; or its output for the default repository if
+            there is no <repositories> element
+
+        """
+        if 'repositories' not in d:
+            return [self.parse_repository(d, self._default_repository)]
+        container = d['repositories']
+        if not isinstance(container, dict):
+            # xmltodict parses a childless element to None or a string
+            return []
+        repositories = container.get('repository') or []
+        if not isinstance(repositories, list):
+            repositories = [repositories]
+        results = []
+        for repo in repositories:
+            res = self.parse_repository(d, repo)
+            if res:
+                results.append(res)
+        return results
+
+    def parse_repository(self, d, repo):
+        """Returns the URL of the project in a repository.
+
+        Args:
+            d (dict): the <project> element, as parsed by xmltodict
+            repo (dict): a <repository> element, as parsed by xmltodict
+
+        Returns:
+            str: the URL of the repository, followed by one path segment
+            for each dot-separated part of the groupId, then by the
+            artifactId; None if the repository has no URL or does not use
+            the default layout
+
+        """
+        if not isinstance(repo, dict):
+            return None
+        if repo.get('layout', 'default') != 'default':
+            return  # TODO ?
+        url = repo.get('url')
+        if not url or not isinstance(url, str):
+            return None
+        # Tolerate a missing groupId/artifactId instead of raising KeyError
+        # (NOTE(review): Maven allows inheriting groupId from <parent>)
+        group_id = d.get('groupId')
+        artifact_id = d.get('artifactId')
+        segments = []
+        if isinstance(group_id, str):
+            segments.extend(group_id.split('.'))
+        if isinstance(artifact_id, str):
+            segments.append(artifact_id)
+        # Not os.path.join: it joins with the separator of the local OS,
+        # and drops the repository URL if a segment starts with '/'.
+        for segment in segments:
+            segment = segment.strip('/')
+            if segment:
+                url = url.rstrip('/') + '/' + segment
+        return url
+
 
 def main():
     raw_content = """{"name": "test_name", "unknown_term": "ut"}"""
diff --git a/swh/indexer/tests/test_metadata.py b/swh/indexer/tests/test_metadata.py
--- a/swh/indexer/tests/test_metadata.py
+++ b/swh/indexer/tests/test_metadata.py
@@ -84,22 +84,22 @@
 
     def test_crosstable(self):
         self.assertEqual(CROSSWALK_TABLE['NodeJS'], {
-            'repository': 'https://codemeta.github.io/terms/codeRepository',
-            'os': 'https://codemeta.github.io/terms/operatingSystem',
-            'cpu': 'https://codemeta.github.io/terms/processorRequirements',
+            'repository': 'http://schema.org/codeRepository',
+            'os': 'http://schema.org/operatingSystem',
+            'cpu': 'http://schema.org/processorRequirements',
             'engines':
-                'https://codemeta.github.io/terms/processorRequirements',
-            'author': 'https://codemeta.github.io/terms/author',
-            'author.email': 'https://codemeta.github.io/terms/email',
-            'author.name': 'https://codemeta.github.io/terms/name',
-            'contributor': 'https://codemeta.github.io/terms/contributor',
-            'keywords': 'https://codemeta.github.io/terms/keywords',
-            'license': 'https://codemeta.github.io/terms/license',
-            'version': 'https://codemeta.github.io/terms/version',
-            'description': 'https://codemeta.github.io/terms/description',
-            'name': 'https://codemeta.github.io/terms/name',
+                'http://schema.org/processorRequirements',
+            'author': 'http://schema.org/author',
+            'author.email': 'http://schema.org/email',
+            'author.name': 'http://schema.org/name',
+            'contributor': 'http://schema.org/contributor',
+            'keywords': 'http://schema.org/keywords',
+            'license': 'http://schema.org/license',
+            'version': 'http://schema.org/version',
+            'description': 'http://schema.org/description',
+            'name': 'http://schema.org/name',
             'bugs': 'https://codemeta.github.io/terms/issueTracker',
-            'homepage': 'https://codemeta.github.io/terms/url'
+
'homepage': 'http://schema.org/url' }) def test_compute_metadata_none(self): @@ -139,15 +139,16 @@ """ declared_metadata = { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', - 'codemeta:name': 'test_metadata', - 'codemeta:version': '0.0.2', - 'codemeta:description': 'Simple package.json test for indexer', - 'codemeta:codeRepository': + 'type': 'SoftwareSourceCode', + 'name': 'test_metadata', + 'version': '0.0.2', + 'description': 'Simple package.json test for indexer', + 'schema:codeRepository': 'git+https://github.com/moranegg/metadata_test', - 'codemeta:author': { - 'type': 'codemeta:Person', - 'codemeta:name': 'Morane G', - 'codemeta:email': 'moranegg@example.com', + 'schema:author': { + 'type': 'Person', + 'name': 'Morane G', + 'email': 'moranegg@example.com', }, } @@ -163,23 +164,23 @@ # given metadata_list = [{ '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', - 'codemeta:name': 'test_1', - 'codemeta:version': '0.0.2', - 'codemeta:description': 'Simple package.json test for indexer', - 'codemeta:codeRepository': + 'name': 'test_1', + 'version': '0.0.2', + 'description': 'Simple package.json test for indexer', + 'schema:codeRepository': 'git+https://github.com/moranegg/metadata_test', }, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', - 'codemeta:name': 'test_0_1', - 'codemeta:version': '0.0.2', - 'codemeta:description': 'Simple package.json test for indexer', - 'codemeta:codeRepository': + 'name': 'test_0_1', + 'version': '0.0.2', + 'description': 'Simple package.json test for indexer', + 'schema:codeRepository': 'git+https://github.com/moranegg/metadata_test' }, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', - 'codemeta:name': 'test_metadata', - 'codemeta:version': '0.0.2', - 'codemeta:author': 'moranegg', + 'name': 'test_metadata', + 'version': '0.0.2', + 'schema:author': 'moranegg', }] # when @@ -188,11 +189,11 @@ # then expected_results = { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', - 
"codemeta:version": '0.0.2', - "codemeta:description": 'Simple package.json test for indexer', - "codemeta:name": ['test_1', 'test_0_1', 'test_metadata'], - "codemeta:author": 'moranegg', - "codemeta:codeRepository": + "version": '0.0.2', + "description": 'Simple package.json test for indexer', + "name": ['test_1', 'test_0_1', 'test_metadata'], + "schema:author": 'moranegg', + "schema:codeRepository": 'git+https://github.com/moranegg/metadata_test', } self.assertEqual(expected_results, results) @@ -220,38 +221,40 @@ 'indexer_configuration_id': 30, 'translated_metadata': { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', - 'codemeta:codeRepository': + 'type': 'SoftwareSourceCode', + 'schema:codeRepository': 'git+https://github.com/moranegg/metadata_test', - 'codemeta:description': 'Simple package.json test for indexer', - 'codemeta:name': 'test_metadata', - 'codemeta:version': '0.0.1' + 'description': 'Simple package.json test for indexer', + 'name': 'test_metadata', + 'version': '0.0.1' }, 'id': '26a9f72a7c87cc9205725cfd879f514ff4f3d8d5' }, { 'indexer_configuration_id': 30, 'translated_metadata': { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', + 'type': 'SoftwareSourceCode', 'codemeta:issueTracker': 'https://github.com/npm/npm/issues', - 'codemeta:author': { - 'type': 'codemeta:Person', - 'codemeta:name': 'Isaac Z. Schlueter', - 'codemeta:email': 'i@izs.me', - 'codemeta:url': 'http://blog.izs.me', + 'schema:author': { + 'type': 'Person', + 'name': 'Isaac Z. 
Schlueter', + 'email': 'i@izs.me', + 'schema:url': 'http://blog.izs.me', }, - 'codemeta:codeRepository': + 'schema:codeRepository': 'git+https://github.com/npm/npm', - 'codemeta:description': 'a package manager for JavaScript', - 'codemeta:license': 'Artistic-2.0', - 'codemeta:version': '5.0.3', - 'codemeta:name': 'npm', - 'codemeta:keywords': [ + 'description': 'a package manager for JavaScript', + 'schema:license': 'Artistic-2.0', + 'version': '5.0.3', + 'name': 'npm', + 'keywords': [ 'install', 'modules', 'package manager', 'package.json' ], - 'codemeta:url': 'https://docs.npmjs.com/' + 'schema:url': 'https://docs.npmjs.com/' }, 'id': 'd4c647f0fc257591cc9ba1722484229780d1c607' }, { @@ -298,6 +301,151 @@ # then self.assertEqual(expected_results, results) + def test_compute_metadata_valid_codemeta(self): + raw_content = ( + b"""{ + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "@type": "SoftwareSourceCode", + "identifier": "CodeMeta", + "description": "CodeMeta is a concept vocabulary that can be used to standardize the exchange of software metadata across repositories and organizations.", + "name": "CodeMeta: Minimal metadata schemas for science software and code, in JSON-LD", + "codeRepository": "https://github.com/codemeta/codemeta", + "issueTracker": "https://github.com/codemeta/codemeta/issues", + "license": "https://spdx.org/licenses/Apache-2.0", + "version": "2.0", + "author": [ + { + "@type": "Person", + "givenName": "Carl", + "familyName": "Boettiger", + "email": "cboettig@gmail.com", + "@id": "http://orcid.org/0000-0002-1642-628X" + }, + { + "@type": "Person", + "givenName": "Matthew B.", + "familyName": "Jones", + "email": "jones@nceas.ucsb.edu", + "@id": "http://orcid.org/0000-0003-0077-4738" + } + ], + "maintainer": { + "@type": "Person", + "givenName": "Carl", + "familyName": "Boettiger", + "email": "cboettig@gmail.com", + "@id": "http://orcid.org/0000-0002-1642-628X" + }, + "contIntegration": "https://travis-ci.org/codemeta/codemeta", 
+ "developmentStatus": "active", + "downloadUrl": "https://github.com/codemeta/codemeta/archive/2.0.zip", + "funder": { + "@id": "https://doi.org/10.13039/100000001", + "@type": "Organization", + "name": "National Science Foundation" + }, + "funding":"1549758; Codemeta: A Rosetta Stone for Metadata in Scientific Software", + "keywords": [ + "metadata", + "software" + ], + "version":"2.0", + "dateCreated":"2017-06-05", + "datePublished":"2017-06-05", + "programmingLanguage": "JSON-LD" + }""") # noqa + expected_result = { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "type": "SoftwareSourceCode", + "identifier": "CodeMeta", + "description": + "CodeMeta is a concept vocabulary that can " + "be used to standardize the exchange of software metadata " + "across repositories and organizations.", + "name": + "CodeMeta: Minimal metadata schemas for science " + "software and code, in JSON-LD", + "codeRepository": "https://github.com/codemeta/codemeta", + "issueTracker": "https://github.com/codemeta/codemeta/issues", + "license": "https://spdx.org/licenses/Apache-2.0", + "version": "2.0", + "author": [ + { + "type": "Person", + "givenName": "Carl", + "familyName": "Boettiger", + "email": "cboettig@gmail.com", + "id": "http://orcid.org/0000-0002-1642-628X" + }, + { + "type": "Person", + "givenName": "Matthew B.", + "familyName": "Jones", + "email": "jones@nceas.ucsb.edu", + "id": "http://orcid.org/0000-0003-0077-4738" + } + ], + "maintainer": { + "type": "Person", + "givenName": "Carl", + "familyName": "Boettiger", + "email": "cboettig@gmail.com", + "id": "http://orcid.org/0000-0002-1642-628X" + }, + "contIntegration": "https://travis-ci.org/codemeta/codemeta", + "developmentStatus": "active", + "downloadUrl": + "https://github.com/codemeta/codemeta/archive/2.0.zip", + "funder": { + "id": "https://doi.org/10.13039/100000001", + "type": "Organization", + "name": "National Science Foundation" + }, + "funding": "1549758; Codemeta: A Rosetta Stone for Metadata " + 
"in Scientific Software",
+            "keywords": [
+                "metadata",
+                "software"
+            ],
+            "version": "2.0",
+            "dateCreated": "2017-06-05",
+            "datePublished": "2017-06-05",
+            "programmingLanguage": "JSON-LD"
+          }
+        result = MAPPINGS["CodemetaMapping"].translate(raw_content)
+        self.assertEqual(result, expected_result)
+
+    def test_compute_metadata_maven(self):
+        raw_content = b"""
+        <project>
+          <name>Maven Default Project</name>
+          <modelVersion>4.0.0</modelVersion>
+          <groupId>com.mycompany.app</groupId>
+          <artifactId>my-app</artifactId>
+          <version>1.2.3</version>
+          <repositories>
+            <repository>
+              <id>central</id>
+              <name>Maven Repository Switchboard</name>
+              <layout>default</layout>
+              <url>http://repo1.maven.org/maven2</url>
+              <snapshots>
+                <enabled>false</enabled>
+              </snapshots>
+            </repository>
+          </repositories>
+        </project>"""
+        result = MAPPINGS["MavenMapping"].translate(raw_content)
+        self.assertEqual(result, {
+            '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
+            'type': 'SoftwareSourceCode',
+            'name': 'Maven Default Project',
+            'schema:identifier': 'com.mycompany.app',
+            'version': '1.2.3',
+            'schema:codeRepository':
+            'http://repo1.maven.org/maven2/com/mycompany/app/my-app',
+            })
+
     def test_revision_metadata_indexer(self):
         metadata_indexer = RevisionMetadataTestIndexer()
 
@@ -312,19 +460,19 @@
             'id': '8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f',
             'translated_metadata': {
                 '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
-                'codemeta:url':
+                'url':
                     'https://github.com/librariesio/yarn-parser#readme',
-                'codemeta:codeRepository':
+                'schema:codeRepository':
                     'git+https://github.com/librariesio/yarn-parser.git',
-                'codemeta:author': 'Andrew Nesbitt',
-                'codemeta:license': 'AGPL-3.0',
-                'codemeta:version': '1.0.0',
-                'codemeta:description':
+                'schema:author': 'Andrew Nesbitt',
+                'license': 'AGPL-3.0',
+                'version': '1.0.0',
+                'description':
                     'Tiny web service for parsing yarn.lock files',
                 'codemeta:issueTracker':
                     'https://github.com/librariesio/yarn-parser/issues',
-                'codemeta:name': 'yarn-parser',
-                'codemeta:keywords': ['yarn', 'parse', 'lock', 'dependencies'],
+                'name': 'yarn-parser',
+                'keywords': ['yarn', 'parse', 'lock', 'dependencies'],
             },
             'indexer_configuration_id': 7
         }])]
diff --git a/swh/indexer/tests/test_origin_metadata.py
b/swh/indexer/tests/test_origin_metadata.py --- a/swh/indexer/tests/test_origin_metadata.py +++ b/swh/indexer/tests/test_origin_metadata.py @@ -94,19 +94,19 @@ metadata = { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', - 'codemeta:url': + 'url': 'https://github.com/librariesio/yarn-parser#readme', - 'codemeta:codeRepository': + 'schema:codeRepository': 'git+https://github.com/librariesio/yarn-parser.git', - 'codemeta:author': 'Andrew Nesbitt', - 'codemeta:license': 'AGPL-3.0', - 'codemeta:version': '1.0.0', - 'codemeta:description': + 'schema:author': 'Andrew Nesbitt', + 'license': 'AGPL-3.0', + 'version': '1.0.0', + 'description': 'Tiny web service for parsing yarn.lock files', 'codemeta:issueTracker': 'https://github.com/librariesio/yarn-parser/issues', - 'codemeta:name': 'yarn-parser', - 'codemeta:keywords': ['yarn', 'parse', 'lock', 'dependencies'], + 'name': 'yarn-parser', + 'keywords': ['yarn', 'parse', 'lock', 'dependencies'], } rev_metadata = { 'id': '8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f', diff --git a/swh/indexer/tests/test_utils.py b/swh/indexer/tests/test_utils.py --- a/swh/indexer/tests/test_utils.py +++ b/swh/indexer/tests/test_utils.py @@ -300,19 +300,20 @@ 'id': b'cde', 'translated_metadata': { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', + 'type': 'SoftwareSourceCode', 'codemeta:issueTracker': 'https://github.com/librariesio/yarn-parser/issues', - 'codemeta:version': '1.0.0', - 'codemeta:name': 'yarn-parser', - 'codemeta:author': 'Andrew Nesbitt', - 'codemeta:url': + 'version': '1.0.0', + 'name': 'yarn-parser', + 'schema:author': 'Andrew Nesbitt', + 'url': 'https://github.com/librariesio/yarn-parser#readme', - 'codemeta:processorRequirements': {'node': '7.5'}, - 'codemeta:license': 'AGPL-3.0', - 'codemeta:keywords': ['yarn', 'parse', 'lock', 'dependencies'], - 'codemeta:codeRepository': + 'processorRequirements': {'node': '7.5'}, + 'license': 'AGPL-3.0', + 'keywords': ['yarn', 'parse', 'lock', 'dependencies'], + 
'schema:codeRepository': 'git+https://github.com/librariesio/yarn-parser.git', - 'codemeta:description': + 'description': 'Tiny web service for parsing yarn.lock files', } }]