diff --git a/requirements.txt b/requirements.txt --- a/requirements.txt +++ b/requirements.txt @@ -3,3 +3,5 @@ click chardet file_magic +pyld +xmltodict diff --git a/swh/indexer/codemeta.py b/swh/indexer/codemeta.py new file mode 100644 --- /dev/null +++ b/swh/indexer/codemeta.py @@ -0,0 +1,120 @@ +# Copyright (C) 2018 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import csv +import json +import os.path + +import swh.indexer +from pyld import jsonld + +_DATA_DIR = os.path.join(os.path.dirname(swh.indexer.__file__), 'data') + +CROSSWALK_TABLE_PATH = os.path.join(_DATA_DIR, 'codemeta', 'crosswalk.csv') + +CODEMETA_CONTEXT_PATH = os.path.join(_DATA_DIR, 'codemeta', 'codemeta.jsonld') + + +with open(CODEMETA_CONTEXT_PATH) as fd: + CODEMETA_CONTEXT = json.load(fd) + +CODEMETA_CONTEXT_URL = 'https://doi.org/10.5063/schema/codemeta-2.0' +CODEMETA_URI = 'https://codemeta.github.io/terms/' +SCHEMA_URI = 'http://schema.org/' + + +PROPERTY_BLACKLIST = { + # CodeMeta properties that we cannot properly represent. 
+ SCHEMA_URI + 'softwareRequirements', + CODEMETA_URI + 'softwareSuggestions', + + # Duplicate of 'author' + SCHEMA_URI + 'creator', + } + + +def make_absolute_uri(local_name): + definition = CODEMETA_CONTEXT['@context'][local_name] + if isinstance(definition, str): + return definition + elif isinstance(definition, dict): + prefixed_name = definition['@id'] + (prefix, local_name) = prefixed_name.split(':') + if prefix == 'schema': + canonical_name = SCHEMA_URI + local_name + elif prefix == 'codemeta': + canonical_name = CODEMETA_URI + local_name + else: + assert False, prefix + return canonical_name + else: + assert False, definition + + +def _read_crosstable(fd): + reader = csv.reader(fd) + try: + header = next(reader) + except StopIteration: + raise ValueError('empty file') + + data_sources = set(header) - {'Parent Type', 'Property', + 'Type', 'Description'} + assert 'codemeta-V1' in data_sources + + codemeta_translation = {data_source: {} for data_source in data_sources} + + for line in reader: # For each canonical name + local_name = dict(zip(header, line))['Property'] + if not local_name: + continue + canonical_name = make_absolute_uri(local_name) + if canonical_name in PROPERTY_BLACKLIST: + continue + for (col, value) in zip(header, line): # For each cell in the row + if col in data_sources: + # If that's not the parentType/property/type/description + for local_name in value.split('/'): + # For each of the data source's properties that maps + # to this canonical name + if local_name.strip(): + codemeta_translation[col][local_name.strip()] = \ + canonical_name + + return codemeta_translation + + +with open(CROSSWALK_TABLE_PATH) as fd: + CROSSWALK_TABLE = _read_crosstable(fd) + + +def _document_loader(url): + """Document loader for pyld. 
+ + Reads the local codemeta.jsonld file instead of fetching it + from the Internet every single time.""" + if url == CODEMETA_CONTEXT_URL: + return { + 'contextUrl': None, + 'documentUrl': url, + 'document': CODEMETA_CONTEXT, + } + elif url == CODEMETA_URI: + raise Exception('{} is CodeMeta\'s URI, use {} as context url'.format( + CODEMETA_URI, CODEMETA_CONTEXT_URL)) + else: + raise Exception(url) + + +def compact(doc): + """Same as `pyld.jsonld.compact`, but in the context of CodeMeta.""" + return jsonld.compact(doc, CODEMETA_CONTEXT_URL, + options={'documentLoader': _document_loader}) + + +def expand(doc): + """Same as `pyld.jsonld.expand`, but in the context of CodeMeta.""" + return jsonld.expand(doc, + options={'documentLoader': _document_loader}) diff --git a/swh/indexer/data/codemeta/CITATION b/swh/indexer/data/codemeta/CITATION new file mode 100644 --- /dev/null +++ b/swh/indexer/data/codemeta/CITATION @@ -0,0 +1,2 @@ +Matthew B. Jones, Carl Boettiger, Abby Cabunoc Mayes, Arfon Smith, Peter Slaughter, Kyle Niemeyer, Yolanda Gil, Martin Fenner, Krzysztof Nowak, Mark Hahnel, Luke Coy, Alice Allen, Mercè Crosas, Ashley Sands, Neil Chue Hong, Patricia Cruse, Daniel S. Katz, Carole Goble. 2017. CodeMeta: an exchange schema for software metadata. Version 2.0. KNB Data Repository. 
doi:10.5063/schema/codemeta-2.0 +swh:1:dir:39c509fd2002f9e531fb4b3a321ceb5e6994e54a;origin=https://github.com/codemeta/codemeta diff --git a/swh/indexer/data/codemeta/codemeta.jsonld b/swh/indexer/data/codemeta/codemeta.jsonld new file mode 100644 --- /dev/null +++ b/swh/indexer/data/codemeta/codemeta.jsonld @@ -0,0 +1,80 @@ +{ + "@context": { + "type": "@type", + "id": "@id", + "schema":"http://schema.org/", + "codemeta": "https://codemeta.github.io/terms/", + "Organization": {"@id": "schema:Organization"}, + "Person": {"@id": "schema:Person"}, + "SoftwareSourceCode": {"@id": "schema:SoftwareSourceCode"}, + "SoftwareApplication": {"@id": "schema:SoftwareApplication"}, + "Text": {"@id": "schema:Text"}, + "URL": {"@id": "schema:URL"}, + "address": { "@id": "schema:address"}, + "affiliation": { "@id": "schema:affiliation"}, + "applicationCategory": { "@id": "schema:applicationCategory", "@type": "@id"}, + "applicationSubCategory": { "@id": "schema:applicationSubCategory", "@type": "@id"}, + "citation": { "@id": "schema:citation"}, + "codeRepository": { "@id": "schema:codeRepository", "@type": "@id"}, + "contributor": { "@id": "schema:contributor"}, + "copyrightHolder": { "@id": "schema:copyrightHolder"}, + "copyrightYear": { "@id": "schema:copyrightYear"}, + "creator": { "@id": "schema:creator"}, + "dateCreated": {"@id": "schema:dateCreated", "@type": "schema:Date" }, + "dateModified": {"@id": "schema:dateModified", "@type": "schema:Date" }, + "datePublished": {"@id": "schema:datePublished", "@type": "schema:Date" }, + "description": { "@id": "schema:description"}, + "downloadUrl": { "@id": "schema:downloadUrl", "@type": "@id"}, + "email": { "@id": "schema:email"}, + "editor": { "@id": "schema:editor"}, + "encoding": { "@id": "schema:encoding"}, + "familyName": { "@id": "schema:familyName"}, + "fileFormat": { "@id": "schema:fileFormat", "@type": "@id"}, + "fileSize": { "@id": "schema:fileSize"}, + "funder": { "@id": "schema:funder"}, + "givenName": { "@id": 
"schema:givenName"}, + "hasPart": { "@id": "schema:hasPart" }, + "identifier": { "@id": "schema:identifier", "@type": "@id"}, + "installUrl": { "@id": "schema:installUrl", "@type": "@id"}, + "isAccessibleForFree": { "@id": "schema:isAccessibleForFree"}, + "isPartOf": { "@id": "schema:isPartOf"}, + "keywords": { "@id": "schema:keywords"}, + "license": { "@id": "schema:license", "@type": "@id"}, + "memoryRequirements": { "@id": "schema:memoryRequirements", "@type": "@id"}, + "name": { "@id": "schema:name"}, + "operatingSystem": { "@id": "schema:operatingSystem"}, + "permissions": { "@id": "schema:permissions"}, + "position": { "@id": "schema:position"}, + "processorRequirements": { "@id": "schema:processorRequirements"}, + "producer": { "@id": "schema:producer"}, + "programmingLanguage": { "@id": "schema:programmingLanguage"}, + "provider": { "@id": "schema:provider"}, + "publisher": { "@id": "schema:publisher"}, + "relatedLink": { "@id": "schema:relatedLink", "@type": "@id"}, + "releaseNotes": { "@id": "schema:releaseNotes", "@type": "@id"}, + "runtimePlatform": { "@id": "schema:runtimePlatform"}, + "sameAs": { "@id": "schema:sameAs", "@type": "@id"}, + "softwareHelp": { "@id": "schema:softwareHelp"}, + "softwareRequirements": { "@id": "schema:softwareRequirements", "@type": "@id"}, + "softwareVersion": { "@id": "schema:softwareVersion"}, + "sponsor": { "@id": "schema:sponsor"}, + "storageRequirements": { "@id": "schema:storageRequirements", "@type": "@id"}, + "supportingData": { "@id": "schema:supportingData"}, + "targetProduct": { "@id": "schema:targetProduct"}, + "url": { "@id": "schema:url", "@type": "@id"}, + "version": { "@id": "schema:version"}, + + "author": { "@id": "schema:author", "@container": "@list" }, + + "softwareSuggestions": { "@id": "codemeta:softwareSuggestions", "@type": "@id"}, + "contIntegration": { "@id": "codemeta:contIntegration", "@type": "@id"}, + "buildInstructions": { "@id": "codemeta:buildInstructions", "@type": "@id"}, + 
"developmentStatus": { "@id": "codemeta:developmentStatus", "@type": "@id"}, + "embargoDate": { "@id":"codemeta:embargoDate", "@type": "schema:Date" }, + "funding": { "@id": "codemeta:funding" }, + "readme": { "@id":"codemeta:readme", "@type": "@id" }, + "issueTracker": { "@id":"codemeta:issueTracker", "@type": "@id" }, + "referencePublication": { "@id": "codemeta:referencePublication", "@type": "@id"}, + "maintainer": { "@id": "codemeta:maintainer" } + } +} diff --git a/swh/indexer/metadata.py b/swh/indexer/metadata.py --- a/swh/indexer/metadata.py +++ b/swh/indexer/metadata.py @@ -160,7 +160,7 @@ - id (str): rev's identifier (sha1_git) - indexer_configuration_id (bytes): tool used - - translated_metadata (bytes): dict of retrieved metadata + - translated_metadata: dict of retrieved metadata """ try: @@ -175,7 +175,7 @@ files = [entry for entry in dir_ls if entry['type'] == 'file'] detected_files = detect_metadata(files) result['translated_metadata'] = self.translate_revision_metadata( - detected_files) + detected_files) except Exception as e: self.log.exception( 'Problem when indexing rev: %r', e) diff --git a/swh/indexer/metadata_detector.py b/swh/indexer/metadata_detector.py --- a/swh/indexer/metadata_detector.py +++ b/swh/indexer/metadata_detector.py @@ -3,7 +3,8 @@ # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information - +from swh.indexer.codemeta import compact, expand +from swh.indexer.codemeta import make_absolute_uri from swh.indexer.metadata_dictionary import MAPPINGS @@ -25,6 +26,16 @@ return results +_MINIMAL_PROPERTY_SET = { + "developmentStatus", "version", "operatingSystem", "description", + "keywords", "issueTracker", "name", "author", "relatedLink", + "url", "license", "maintainer", "email", "identifier", + "codeRepository"} + +MINIMAL_METADATA_SET = {make_absolute_uri(prop) + for prop in _MINIMAL_PROPERTY_SET} + + def extract_minimal_metadata_dict(metadata_list): """ Every item in 
the metadata_list is a dict of translated_metadata in the @@ -37,29 +48,13 @@ Returns: - minimal_dict (dict): one dict with selected values of metadata """ - minimal_dict = { - "developmentStatus": [], - "version": [], - "operatingSystem": [], - "description": [], - "keywords": [], - "issueTracker": [], - "name": [], - "author": [], - "relatedLink": [], - "url": [], - "license": [], - "maintainer": [], - "email": [], - "softwareRequirements": [], - "identifier": [], - "codeRepository": [] - } - for term in minimal_dict.keys(): - for metadata_item in metadata_list: - if term in metadata_item: - if not metadata_item[term] in minimal_dict[term]: - minimal_dict[term].append(metadata_item[term]) - if not minimal_dict[term]: - minimal_dict[term] = None - return minimal_dict + minimal_dict = {} + for document in metadata_list: + for metadata_item in expand(document): + for (term, value) in metadata_item.items(): + if term in MINIMAL_METADATA_SET: + if term not in minimal_dict: + minimal_dict[term] = [value] + elif value not in minimal_dict[term]: + minimal_dict[term].append(value) + return compact(minimal_dict) diff --git a/swh/indexer/metadata_dictionary.py b/swh/indexer/metadata_dictionary.py --- a/swh/indexer/metadata_dictionary.py +++ b/swh/indexer/metadata_dictionary.py @@ -3,48 +3,15 @@ # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +import os +import re import abc -import csv import json -import os.path import logging +import xmltodict -import swh.indexer - -CROSSWALK_TABLE_PATH = os.path.join(os.path.dirname(swh.indexer.__file__), - 'data', 'codemeta', 'crosswalk.csv') - - -def read_crosstable(fd): - reader = csv.reader(fd) - try: - header = next(reader) - except StopIteration: - raise ValueError('empty file') - - data_sources = set(header) - {'Parent Type', 'Property', - 'Type', 'Description'} - assert 'codemeta-V1' in data_sources - - codemeta_translation = {data_source: {} for data_source 
in data_sources} - - for line in reader: # For each canonical name - canonical_name = dict(zip(header, line))['Property'] - for (col, value) in zip(header, line): # For each cell in the row - if col in data_sources: - # If that's not the parentType/property/type/description - for local_name in value.split('/'): - # For each of the data source's properties that maps - # to this canonical name - if local_name.strip(): - codemeta_translation[col][local_name.strip()] = \ - canonical_name - - return codemeta_translation - - -with open(CROSSWALK_TABLE_PATH) as fd: - CROSSWALK_TABLE = read_crosstable(fd) +from swh.indexer.codemeta import CROSSWALK_TABLE, SCHEMA_URI +from swh.indexer.codemeta import compact, expand MAPPINGS = {} @@ -85,6 +52,25 @@ def translate(self, file_content): pass + def normalize_translation(self, metadata): + return compact(metadata) + + +class SingleFileMapping(BaseMapping): + """Base class for all mappings that use a single file as input.""" + + @property + @abc.abstractmethod + def filename(self): + """The .json file to extract metadata from.""" + pass + + def detect_metadata_files(self, file_entries): + for entry in file_entries: + if entry['name'] == self.filename: + return [entry['sha1']] + return [] + class DictMapping(BaseMapping): """Base class for mappings that take as input a file that is mostly @@ -96,7 +82,7 @@ """A translation dict to map dict keys into a canonical name.""" pass - def translate_dict(self, content_dict): + def translate_dict(self, content_dict, *, normalize=True): """ Translates content by parsing content from a dict object and translating with the appropriate mapping @@ -109,51 +95,33 @@ the indexer """ - translated_metadata = {} - default = 'other' - translated_metadata['other'] = {} - try: - for k, v in content_dict.items(): - try: - term = self.mapping.get(k, default) - if term not in translated_metadata: - translated_metadata[term] = v - continue - if isinstance(translated_metadata[term], str): - in_value = 
translated_metadata[term] - translated_metadata[term] = [in_value, v] - continue - if isinstance(translated_metadata[term], list): - translated_metadata[term].append(v) - continue - if isinstance(translated_metadata[term], dict): - translated_metadata[term][k] = v - continue - except KeyError: - self.log.exception( - "Problem during item mapping") - continue - except Exception: - raise - return None - return translated_metadata - - -class JsonMapping(DictMapping): + translated_metadata = {'@type': SCHEMA_URI + 'SoftwareSourceCode'} + for k, v in content_dict.items(): + # First, check if there is a specific translation + # method for this key + translation_method = getattr(self, 'translate_' + k, None) + if translation_method: + translation_method(translated_metadata, v) + elif k in self.mapping: + # if there is no method, but the key is known from the + # crosswalk table + + # if there is a normalization method, use it on the value + normalization_method = getattr(self, 'normalize_' + k, None) + if normalization_method: + v = normalization_method(v) + + # set the translation metadata with the normalized value + translated_metadata[self.mapping[k]] = v + if normalize: + return self.normalize_translation(translated_metadata) + else: + return translated_metadata + + +class JsonMapping(DictMapping, SingleFileMapping): """Base class for all mappings that use a JSON file as input.""" - @property - @abc.abstractmethod - def filename(self): - """The .json file to extract metadata from.""" - pass - - def detect_metadata_files(self, file_entries): - for entry in file_entries: - if entry['name'] == self.filename: - return [entry['sha1']] - return [] - def translate(self, raw_content): """ Translates content by parsing content from a bytestring containing @@ -188,15 +156,117 @@ mapping = CROSSWALK_TABLE['NodeJS'] filename = b'package.json' + _schema_shortcuts = { + 'github': 'https://github.com/', + 'gist': 'https://gist.github.com/', + 'bitbucket': 'https://bitbucket.org/', + 
'gitlab': 'https://gitlab.com/', + } + + def normalize_repository(self, d): + """https://docs.npmjs.com/files/package.json#repository""" + if isinstance(d, dict): + return '{type}+{url}'.format(**d) + elif isinstance(d, str): + if '://' in d: + return d + elif ':' in d: + (schema, rest) = d.split(':', 1) + if schema in self._schema_shortcuts: + return self._schema_shortcuts[schema] + rest + else: + return None + else: + return self._schema_shortcuts['github'] + d + + else: + return None + + def normalize_bugs(self, d): + return '{url}'.format(**d) + + _parse_author = re.compile(r'^ *' + r'(?P<name>.*?)' + r'( +<(?P<email>.*)>)?' + r'( +\((?P<url>.*)\))?' + r' *$') + + def normalize_author(self, d): + 'https://docs.npmjs.com/files/package.json' \ + '#people-fields-author-contributors' + author = {'@type': SCHEMA_URI+'Person'} + if isinstance(d, dict): + name = d.get('name', None) + email = d.get('email', None) + url = d.get('url', None) + elif isinstance(d, str): + match = self._parse_author.match(d) + name = match.group('name') + email = match.group('email') + url = match.group('url') + else: + return None + if name: + author[SCHEMA_URI+'name'] = name + if email: + author[SCHEMA_URI+'email'] = email + if url: + author[SCHEMA_URI+'url'] = url + return author + @register_mapping -class CodemetaMapping(JsonMapping): +class CodemetaMapping(SingleFileMapping): """ dedicated class for CodeMeta (codemeta.json) mapping and translation """ - mapping = CROSSWALK_TABLE['codemeta-V1'] filename = b'codemeta.json' + def translate(self, content): + return self.normalize_translation(expand(json.loads(content.decode()))) + + +@register_mapping +class MavenMapping(DictMapping, SingleFileMapping): + """ + dedicated class for Maven (pom.xml) mapping and translation + """ + filename = b'pom.xml' + mapping = CROSSWALK_TABLE['Java (Maven)'] + + def translate(self, content): + d = xmltodict.parse(content)['project'] + metadata = self.translate_dict(d, normalize=False) +
metadata[SCHEMA_URI+'codeRepository'] = self.parse_repositories(d) + return self.normalize_translation(metadata) + + _default_repository = {'url': 'https://repo.maven.apache.org/maven2/'} + + def parse_repositories(self, d): + """https://maven.apache.org/pom.html#Repositories""" + if 'repositories' not in d: + return [self.parse_repository(d, self._default_repository)] + else: + repositories = d['repositories'].get('repository', []) + if not isinstance(repositories, list): + repositories = [repositories] + results = [] + for repo in repositories: + res = self.parse_repository(d, repo) + if res: + results.append(res) + return results + + def parse_repository(self, d, repo): + if repo.get('layout', 'default') != 'default': + return # TODO ? + url = repo['url'] + if d['groupId']: + url = os.path.join(url, *d['groupId'].split('.')) + if d['artifactId']: + url = os.path.join(url, d['artifactId']) + return url + def main(): raw_content = """{"name": "test_name", "unknown_term": "ut"}""" diff --git a/swh/indexer/tests/test_metadata.py b/swh/indexer/tests/test_metadata.py --- a/swh/indexer/tests/test_metadata.py +++ b/swh/indexer/tests/test_metadata.py @@ -84,27 +84,22 @@ def test_crosstable(self): self.assertEqual(CROSSWALK_TABLE['NodeJS'], { - 'repository': 'codeRepository', - 'os': 'operatingSystem', - 'cpu': 'processorRequirements', - 'engines': 'processorRequirements', - 'dependencies': 'softwareRequirements', - 'bundleDependencies': 'softwareRequirements', - 'bundledDependencies': 'softwareRequirements', - 'peerDependencies': 'softwareRequirements', - 'author': 'creator', - 'author.email': 'email', - 'author.name': 'name', - 'contributor': 'contributor', - 'keywords': 'keywords', - 'license': 'license', - 'version': 'version', - 'description': 'description', - 'name': 'name', - 'devDependencies': 'softwareSuggestions', - 'optionalDependencies': 'softwareSuggestions', - 'bugs': 'issueTracker', - 'homepage': 'url' + 'repository': 'http://schema.org/codeRepository', + 
'os': 'http://schema.org/operatingSystem', + 'cpu': 'http://schema.org/processorRequirements', + 'engines': + 'http://schema.org/processorRequirements', + 'author': 'http://schema.org/author', + 'author.email': 'http://schema.org/email', + 'author.name': 'http://schema.org/name', + 'contributor': 'http://schema.org/contributor', + 'keywords': 'http://schema.org/keywords', + 'license': 'http://schema.org/license', + 'version': 'http://schema.org/version', + 'description': 'http://schema.org/description', + 'name': 'http://schema.org/name', + 'bugs': 'https://codemeta.github.io/terms/issueTracker', + 'homepage': 'http://schema.org/url' }) def test_compute_metadata_none(self): @@ -135,18 +130,26 @@ "repository": { "type": "git", "url": "https://github.com/moranegg/metadata_test" + }, + "author": { + "email": "moranegg@example.com", + "name": "Morane G" } } """ declared_metadata = { + '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', + 'type': 'SoftwareSourceCode', 'name': 'test_metadata', 'version': '0.0.2', 'description': 'Simple package.json test for indexer', - 'codeRepository': { - 'type': 'git', - 'url': 'https://github.com/moranegg/metadata_test' - }, - 'other': {} + 'schema:codeRepository': + 'git+https://github.com/moranegg/metadata_test', + 'schema:author': { + 'type': 'Person', + 'name': 'Morane G', + 'email': 'moranegg@example.com', + }, } # when @@ -160,28 +163,24 @@ """ # given metadata_list = [{ + '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'name': 'test_1', 'version': '0.0.2', 'description': 'Simple package.json test for indexer', - 'codeRepository': { - 'type': 'git', - 'url': 'https://github.com/moranegg/metadata_test' - }, - 'other': {} + 'schema:codeRepository': + 'git+https://github.com/moranegg/metadata_test', }, { + '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'name': 'test_0_1', 'version': '0.0.2', 'description': 'Simple package.json test for indexer', - 'codeRepository': { - 'type': 'git', - 'url': 
'https://github.com/moranegg/metadata_test' - }, - 'other': {} + 'schema:codeRepository': + 'git+https://github.com/moranegg/metadata_test' }, { + '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'name': 'test_metadata', 'version': '0.0.2', - 'author': 'moranegg', - 'other': {} + 'schema:author': 'moranegg', }] # when @@ -189,25 +188,13 @@ # then expected_results = { - "developmentStatus": None, - "version": ['0.0.2'], - "operatingSystem": None, - "description": ['Simple package.json test for indexer'], - "keywords": None, - "issueTracker": None, + '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', + "version": '0.0.2', + "description": 'Simple package.json test for indexer', "name": ['test_1', 'test_0_1', 'test_metadata'], - "author": ['moranegg'], - "relatedLink": None, - "url": None, - "license": None, - "maintainer": None, - "email": None, - "softwareRequirements": None, - "identifier": None, - "codeRepository": [{ - 'type': 'git', - 'url': 'https://github.com/moranegg/metadata_test' - }] + "schema:author": 'moranegg', + "schema:codeRepository": + 'git+https://github.com/moranegg/metadata_test', } self.assertEqual(expected_results, results) @@ -233,11 +220,10 @@ expected_results = [('content_metadata', False, [{ 'indexer_configuration_id': 30, 'translated_metadata': { - 'other': {}, - 'codeRepository': { - 'type': 'git', - 'url': 'https://github.com/moranegg/metadata_test' - }, + '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', + 'type': 'SoftwareSourceCode', + 'schema:codeRepository': + 'git+https://github.com/moranegg/metadata_test', 'description': 'Simple package.json test for indexer', 'name': 'test_metadata', 'version': '0.0.1' @@ -246,35 +232,21 @@ }, { 'indexer_configuration_id': 30, 'translated_metadata': { - 'softwareRequirements': { - 'JSONStream': '~1.3.1', - 'abbrev': '~1.1.0', - 'ansi-regex': '~2.1.1', - 'ansicolors': '~0.3.2', - 'ansistyles': '~0.1.3' - }, - 'issueTracker': { - 'url': 'https://github.com/npm/npm/issues' 
- }, - 'creator': - 'Isaac Z. Schlueter <i@izs.me> (http://blog.izs.me)', - 'codeRepository': { - 'type': 'git', - 'url': 'https://github.com/npm/npm' + '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', + 'type': 'SoftwareSourceCode', + 'codemeta:issueTracker': + 'https://github.com/npm/npm/issues', + 'schema:author': { + 'type': 'Person', + 'name': 'Isaac Z. Schlueter', + 'email': 'i@izs.me', + 'schema:url': 'http://blog.izs.me', }, + 'schema:codeRepository': + 'git+https://github.com/npm/npm', 'description': 'a package manager for JavaScript', - 'softwareSuggestions': { - 'tacks': '~1.2.6', - 'tap': '~10.3.2' - }, - 'license': 'Artistic-2.0', + 'schema:license': 'Artistic-2.0', 'version': '5.0.3', - 'other': { - 'preferGlobal': True, - 'config': { - 'publishtest': False - } - }, 'name': 'npm', 'keywords': [ 'install', @@ -282,7 +254,7 @@ 'package manager', 'package.json' ], - 'url': 'https://docs.npmjs.com/' + 'schema:url': 'https://docs.npmjs.com/' }, 'id': 'd4c647f0fc257591cc9ba1722484229780d1c607' }, { @@ -329,6 +301,151 @@ # then self.assertEqual(expected_results, results) + def test_compute_metadata_valid_codemeta(self): + raw_content = ( + b"""{ + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "@type": "SoftwareSourceCode", + "identifier": "CodeMeta", + "description": "CodeMeta is a concept vocabulary that can be used to standardize the exchange of software metadata across repositories and organizations.", + "name": "CodeMeta: Minimal metadata schemas for science software and code, in JSON-LD", + "codeRepository": "https://github.com/codemeta/codemeta", + "issueTracker": "https://github.com/codemeta/codemeta/issues", + "license": "https://spdx.org/licenses/Apache-2.0", + "version": "2.0", + "author": [ + { + "@type": "Person", + "givenName": "Carl", + "familyName": "Boettiger", + "email": "cboettig@gmail.com", + "@id": "http://orcid.org/0000-0002-1642-628X" + }, + { + "@type": "Person", + "givenName": "Matthew B.", + "familyName": "Jones", +
"email": "jones@nceas.ucsb.edu", + "@id": "http://orcid.org/0000-0003-0077-4738" + } + ], + "maintainer": { + "@type": "Person", + "givenName": "Carl", + "familyName": "Boettiger", + "email": "cboettig@gmail.com", + "@id": "http://orcid.org/0000-0002-1642-628X" + }, + "contIntegration": "https://travis-ci.org/codemeta/codemeta", + "developmentStatus": "active", + "downloadUrl": "https://github.com/codemeta/codemeta/archive/2.0.zip", + "funder": { + "@id": "https://doi.org/10.13039/100000001", + "@type": "Organization", + "name": "National Science Foundation" + }, + "funding":"1549758; Codemeta: A Rosetta Stone for Metadata in Scientific Software", + "keywords": [ + "metadata", + "software" + ], + "version":"2.0", + "dateCreated":"2017-06-05", + "datePublished":"2017-06-05", + "programmingLanguage": "JSON-LD" + }""") # noqa + expected_result = { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "type": "SoftwareSourceCode", + "identifier": "CodeMeta", + "description": + "CodeMeta is a concept vocabulary that can " + "be used to standardize the exchange of software metadata " + "across repositories and organizations.", + "name": + "CodeMeta: Minimal metadata schemas for science " + "software and code, in JSON-LD", + "codeRepository": "https://github.com/codemeta/codemeta", + "issueTracker": "https://github.com/codemeta/codemeta/issues", + "license": "https://spdx.org/licenses/Apache-2.0", + "version": "2.0", + "author": [ + { + "type": "Person", + "givenName": "Carl", + "familyName": "Boettiger", + "email": "cboettig@gmail.com", + "id": "http://orcid.org/0000-0002-1642-628X" + }, + { + "type": "Person", + "givenName": "Matthew B.", + "familyName": "Jones", + "email": "jones@nceas.ucsb.edu", + "id": "http://orcid.org/0000-0003-0077-4738" + } + ], + "maintainer": { + "type": "Person", + "givenName": "Carl", + "familyName": "Boettiger", + "email": "cboettig@gmail.com", + "id": "http://orcid.org/0000-0002-1642-628X" + }, + "contIntegration":
"https://travis-ci.org/codemeta/codemeta", + "developmentStatus": "active", + "downloadUrl": + "https://github.com/codemeta/codemeta/archive/2.0.zip", + "funder": { + "id": "https://doi.org/10.13039/100000001", + "type": "Organization", + "name": "National Science Foundation" + }, + "funding": "1549758; Codemeta: A Rosetta Stone for Metadata " + "in Scientific Software", + "keywords": [ + "metadata", + "software" + ], + "version": "2.0", + "dateCreated": "2017-06-05", + "datePublished": "2017-06-05", + "programmingLanguage": "JSON-LD" + } + result = MAPPINGS["CodemetaMapping"].translate(raw_content) + self.assertEqual(result, expected_result) + + def test_compute_metadata_maven(self): + raw_content = b""" + <project> + <name>Maven Default Project</name> + <modelVersion>4.0.0</modelVersion> + <groupId>com.mycompany.app</groupId> + <artifactId>my-app</artifactId> + <version>1.2.3</version> + <repositories> + <repository> + <id>central</id> + <name>Maven Repository Switchboard</name> + <layout>default</layout> + <url>http://repo1.maven.org/maven2</url> + <snapshots> + <enabled>false</enabled> + </snapshots> + </repository> + </repositories> + </project>""" + result = MAPPINGS["MavenMapping"].translate(raw_content) + self.assertEqual(result, { + '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', + 'type': 'SoftwareSourceCode', + 'name': 'Maven Default Project', + 'schema:identifier': 'com.mycompany.app', + 'version': '1.2.3', + 'schema:codeRepository': + 'http://repo1.maven.org/maven2/com/mycompany/app/my-app', + }) + def test_revision_metadata_indexer(self): metadata_indexer = RevisionMetadataTestIndexer() @@ -342,35 +459,20 @@ expected_results = [('revision_metadata', True, [{ 'id': '8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f', 'translated_metadata': { - 'identifier': None, - 'maintainer': None, - 'url': [ - 'https://github.com/librariesio/yarn-parser#readme' - ], - 'codeRepository': [{ - 'type': 'git', - 'url': 'git+https://github.com/librariesio/yarn-parser.git' - }], - 'author': ['Andrew Nesbitt'], - 'license': ['AGPL-3.0'], - 'version': ['1.0.0'], - 'description': [ - 'Tiny web service for parsing yarn.lock files' - ], - 'relatedLink': None, - 'developmentStatus': None, - 'operatingSystem': None, - 'issueTracker': [{ - 'url':
'https://github.com/librariesio/yarn-parser/issues' - }], - 'softwareRequirements': [{ - 'express': '^4.14.0', - 'yarn': '^0.21.0', - 'body-parser': '^1.15.2' - }], - 'name': ['yarn-parser'], - 'keywords': [['yarn', 'parse', 'lock', 'dependencies']], - 'email': None + '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', + 'url': + 'https://github.com/librariesio/yarn-parser#readme', + 'schema:codeRepository': + 'git+https://github.com/librariesio/yarn-parser.git', + 'schema:author': 'Andrew Nesbitt', + 'license': 'AGPL-3.0', + 'version': '1.0.0', + 'description': + 'Tiny web service for parsing yarn.lock files', + 'codemeta:issueTracker': + 'https://github.com/librariesio/yarn-parser/issues', + 'name': 'yarn-parser', + 'keywords': ['yarn', 'parse', 'lock', 'dependencies'], }, 'indexer_configuration_id': 7 }])] diff --git a/swh/indexer/tests/test_origin_metadata.py b/swh/indexer/tests/test_origin_metadata.py --- a/swh/indexer/tests/test_origin_metadata.py +++ b/swh/indexer/tests/test_origin_metadata.py @@ -93,35 +93,20 @@ self.run_ready_tasks() # Run the second task metadata = { - 'identifier': None, - 'maintainer': None, - 'url': [ - 'https://github.com/librariesio/yarn-parser#readme' - ], - 'codeRepository': [{ - 'type': 'git', - 'url': 'git+https://github.com/librariesio/yarn-parser.git' - }], - 'author': ['Andrew Nesbitt'], - 'license': ['AGPL-3.0'], - 'version': ['1.0.0'], - 'description': [ - 'Tiny web service for parsing yarn.lock files' - ], - 'relatedLink': None, - 'developmentStatus': None, - 'operatingSystem': None, - 'issueTracker': [{ - 'url': 'https://github.com/librariesio/yarn-parser/issues' - }], - 'softwareRequirements': [{ - 'express': '^4.14.0', - 'yarn': '^0.21.0', - 'body-parser': '^1.15.2' - }], - 'name': ['yarn-parser'], - 'keywords': [['yarn', 'parse', 'lock', 'dependencies']], - 'email': None + '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', + 'url': + 'https://github.com/librariesio/yarn-parser#readme', + 
'schema:codeRepository': + 'git+https://github.com/librariesio/yarn-parser.git', + 'schema:author': 'Andrew Nesbitt', + 'license': 'AGPL-3.0', + 'version': '1.0.0', + 'description': + 'Tiny web service for parsing yarn.lock files', + 'codemeta:issueTracker': + 'https://github.com/librariesio/yarn-parser/issues', + 'name': 'yarn-parser', + 'keywords': ['yarn', 'parse', 'lock', 'dependencies'], } rev_metadata = { 'id': '8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f', diff --git a/swh/indexer/tests/test_utils.py b/swh/indexer/tests/test_utils.py --- a/swh/indexer/tests/test_utils.py +++ b/swh/indexer/tests/test_utils.py @@ -299,31 +299,22 @@ }, 'id': b'cde', 'translated_metadata': { - 'issueTracker': { - 'url': 'https://github.com/librariesio/yarn-parser/issues' - }, + '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', + 'type': 'SoftwareSourceCode', + 'codemeta:issueTracker': + 'https://github.com/librariesio/yarn-parser/issues', 'version': '1.0.0', 'name': 'yarn-parser', - 'author': 'Andrew Nesbitt', - 'url': 'https://github.com/librariesio/yarn-parser#readme', + 'schema:author': 'Andrew Nesbitt', + 'url': + 'https://github.com/librariesio/yarn-parser#readme', 'processorRequirements': {'node': '7.5'}, - 'other': { - 'scripts': { - 'start': 'node index.js' - }, - 'main': 'index.js' - }, 'license': 'AGPL-3.0', 'keywords': ['yarn', 'parse', 'lock', 'dependencies'], - 'codeRepository': { - 'type': 'git', - 'url': 'git+https://github.com/librariesio/yarn-parser.git' - }, - 'description': 'Tiny web service for parsing yarn.lock files', - 'softwareRequirements': { - 'yarn': '^0.21.0', - 'express': '^4.14.0', - 'body-parser': '^1.15.2'} + 'schema:codeRepository': + 'git+https://github.com/librariesio/yarn-parser.git', + 'description': + 'Tiny web service for parsing yarn.lock files', } }] @@ -339,9 +330,8 @@ if origin[k] != v: break else: - # This block is run if and only if we didn't break, - # ie. if all supplied parts of the id are set to the - # expected value. 
+ # This block is run iff we didn't break, ie. if all supplied + # parts of the id are set to the expected value. return origin assert False, id_