diff --git a/swh/indexer/codemeta.py b/swh/indexer/codemeta.py
index 267b7bf..e06744b 100644
--- a/swh/indexer/codemeta.py
+++ b/swh/indexer/codemeta.py
@@ -1,95 +1,99 @@
 # Copyright (C) 2018 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

 import csv
 import json
 import os.path

 import swh.indexer
 from pyld import jsonld

 _DATA_DIR = os.path.join(os.path.dirname(swh.indexer.__file__), 'data')

 CROSSWALK_TABLE_PATH = os.path.join(_DATA_DIR, 'codemeta', 'crosswalk.csv')

 CODEMETA_CONTEXT_PATH = os.path.join(_DATA_DIR, 'codemeta', 'codemeta.jsonld')


 with open(CODEMETA_CONTEXT_PATH) as fd:
     CODEMETA_CONTEXT = json.load(fd)

 CODEMETA_CONTEXT_URL = 'https://doi.org/10.5063/schema/codemeta-2.0'
 CODEMETA_URI = 'https://codemeta.github.io/terms/'
+SCHEMA_URI = 'http://schema.org/'


-# CodeMeta properties that we cannot properly represent.
 PROPERTY_BLACKLIST = {
-    'https://codemeta.github.io/terms/softwareRequirements',
-    'https://codemeta.github.io/terms/softwareSuggestions',
+    # CodeMeta properties that we cannot properly represent.
+    CODEMETA_URI + 'softwareRequirements',
+    CODEMETA_URI + 'softwareSuggestions',
+
+    # Duplicate of 'author'
+    CODEMETA_URI + 'creator',
 }


 def _read_crosstable(fd):
     reader = csv.reader(fd)
     try:
         header = next(reader)
     except StopIteration:
         raise ValueError('empty file')

     data_sources = set(header) - {'Parent Type', 'Property',
                                   'Type', 'Description'}
     assert 'codemeta-V1' in data_sources

     codemeta_translation = {data_source: {} for data_source in data_sources}

     for line in reader:  # For each canonical name
         canonical_name = CODEMETA_URI + dict(zip(header, line))['Property']
         if canonical_name in PROPERTY_BLACKLIST:
             continue
         for (col, value) in zip(header, line):  # For each cell in the row
             if col in data_sources:
                 # If that's not the parentType/property/type/description
                 for local_name in value.split('/'):
                     # For each of the data source's properties that maps
                     # to this canonical name
                     if local_name.strip():
                         codemeta_translation[col][local_name.strip()] = \
                                 canonical_name

     return codemeta_translation


 with open(CROSSWALK_TABLE_PATH) as fd:
     CROSSWALK_TABLE = _read_crosstable(fd)


 def _document_loader(url):
     """Document loader for pyld.

     Reads the local codemeta.jsonld file instead of fetching it
     from the Internet every single time."""
     if url == CODEMETA_CONTEXT_URL:
         return {
             'contextUrl': None,
             'documentUrl': url,
             'document': CODEMETA_CONTEXT,
         }
     elif url == CODEMETA_URI:
         raise Exception('{} is CodeMeta\'s URI, use {} as context url'.format(
             CODEMETA_URI, CODEMETA_CONTEXT_URL))
     else:
         raise Exception(url)


 def compact(doc):
     """Same as `pyld.jsonld.compact`, but in the context of CodeMeta."""
     return jsonld.compact(doc, CODEMETA_CONTEXT_URL,
                           options={'documentLoader': _document_loader})


 def expand(doc):
     """Same as `pyld.jsonld.expand`, but in the context of CodeMeta."""
     return jsonld.expand(doc,
                          options={'documentLoader': _document_loader})
diff --git a/swh/indexer/metadata_dictionary.py b/swh/indexer/metadata_dictionary.py
index bf704c8..c2cd7eb 100644
--- a/swh/indexer/metadata_dictionary.py
+++ b/swh/indexer/metadata_dictionary.py
@@ -1,177 +1,284 @@
 # Copyright (C) 2017 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

+import re
 import abc
 import json
 import logging

-from swh.indexer.codemeta import CROSSWALK_TABLE, compact
+from swh.indexer.codemeta import CROSSWALK_TABLE, CODEMETA_URI, compact


 MAPPINGS = {}


 def register_mapping(cls):
     MAPPINGS[cls.__name__] = cls()
     return cls


 class BaseMapping(metaclass=abc.ABCMeta):
     """Base class for mappings to inherit from

     To implement a new mapping:

     - inherit this class
     - override translate function
     """
     def __init__(self):
         self.log = logging.getLogger('%s.%s' % (
             self.__class__.__module__,
             self.__class__.__name__))

     @abc.abstractmethod
     def detect_metadata_files(self, files):
         """
         Detects files potentially containing metadata
         Args:
             - file_entries (list): list of files

         Returns:
             - empty list if nothing was found
             - list of sha1 otherwise
         """
         pass

     @abc.abstractmethod
     def translate(self, file_content):
         pass

     def normalize_translation(self, metadata):
         return compact(metadata)


 class DictMapping(BaseMapping):
     """Base class for mappings that take as input a file that is mostly
     a key-value store (eg. a shallow JSON dict)."""

     @property
     @abc.abstractmethod
     def mapping(self):
         """A translation dict to map dict keys into a canonical name."""
         pass

     def translate_dict(self, content_dict):
         """
         Translates content by parsing content from a dict object
         and translating with the appropriate mapping

         Args:
             content_dict (dict)

         Returns:
             dict: translated metadata in json-friendly form needed for
                   the indexer

         """
         translated_metadata = {}
         for k, v in content_dict.items():
             # First, check if there is a specific translation
             # method for this key
             translation_method = getattr(self, 'translate_' + k, None)
             if translation_method:
                 translation_method(translated_metadata, v)
             elif k in self.mapping:
                 # if there is no method, but the key is known from the
                 # crosswalk table

                 # if there is a normalization method, use it on the value
                 normalization_method = getattr(self, 'normalize_' + k, None)
                 if normalization_method:
                     v = normalization_method(v)

                 # set the translation metadata with the normalized value
                 translated_metadata[self.mapping[k]] = v
         return self.normalize_translation(translated_metadata)


 class JsonMapping(DictMapping):
     """Base class for all mappings that use a JSON file as input."""

     @property
     @abc.abstractmethod
     def filename(self):
         """The .json file to extract metadata from."""
         pass

     def detect_metadata_files(self, file_entries):
         for entry in file_entries:
             if entry['name'] == self.filename:
                 return [entry['sha1']]
         return []

     def translate(self, raw_content):
         """
         Translates content by parsing content from a bytestring containing
         json data and translating with the appropriate mapping

         Args:
             raw_content: bytes

         Returns:
             dict: translated metadata in json-friendly form needed for
                   the indexer

         """
         try:
             raw_content = raw_content.decode()
         except UnicodeDecodeError:
             self.log.warning('Error unidecoding %r', raw_content)
             return
         try:
             content_dict = json.loads(raw_content)
         except json.JSONDecodeError:
             self.log.warning('Error unjsoning %r' % raw_content)
             return
         return self.translate_dict(content_dict)


 @register_mapping
 class NpmMapping(JsonMapping):
     """
     dedicated class for NPM (package.json) mapping and translation
     """
     mapping = CROSSWALK_TABLE['NodeJS']
     filename = b'package.json'

+    # Prefixes npm accepts in the string form of `repository` as a
+    # shorthand for well-known hosting services.
+    _schema_shortcuts = {
+        'github': 'https://github.com/',
+        'gist': 'https://gist.github.com/',
+        'bitbucket': 'https://bitbucket.org/',
+        'gitlab': 'https://gitlab.com/',
+    }
+
     def normalize_repository(self, d):
-        return '{type}+{url}'.format(**d)
+        """Normalizes the `repository` field into a single URL string.
+
+        See https://docs.npmjs.com/files/package.json#repository
+
+        Args:
+            d: a dict with `type` and `url` keys, a full URL, a
+               `shortcut:path` string, or a bare `user/repo` string
+               (which is resolved against GitHub).
+
+        Returns:
+            str: the repository URL, or None if `d` has an unsupported
+            shape or uses an unknown shortcut.
+
+        """
+        if isinstance(d, dict):
+            if 'type' in d and 'url' in d:
+                return '{type}+{url}'.format(**d)
+            else:
+                # Incomplete entry: there is no URL we can rebuild.
+                return None
+        elif isinstance(d, str):
+            if '://' in d:
+                return d
+            elif ':' in d:
+                (schema, rest) = d.split(':', 1)
+                if schema in self._schema_shortcuts:
+                    return self._schema_shortcuts[schema] + rest
+                else:
+                    return None
+            else:
+                return self._schema_shortcuts['github'] + d
+
+        else:
+            return None

     def normalize_bugs(self, d):
-        return '{url}'.format(**d)
+        """Normalizes the `bugs` field into the issue tracker's URL.
+
+        See https://docs.npmjs.com/files/package.json#bugs
+
+        Args:
+            d: a dict with an optional `url` key, or the URL itself
+               as a string.
+
+        Returns:
+            str: the issue tracker's URL, or None if none is provided.
+
+        """
+        if isinstance(d, dict) and 'url' in d:
+            return '{url}'.format(**d)
+        elif isinstance(d, str):
+            return d
+        else:
+            return None

+    # Matches npm's shorthand for people: a name, optionally followed by
+    # an email between angle brackets and a URL between parentheses.
+    _parse_author = re.compile(r'^ *'
+                               r'(?P<name>.*?)'
+                               r'( +<(?P<email>.*)>)?'
+                               r'( +\((?P<url>.*)\))?'
+                               r' *$')
+
+    def normalize_author(self, d):
+        """Normalizes the `author` field into a CodeMeta Person.
+
+        See the "people fields" section of
+        https://docs.npmjs.com/files/package.json
+
+        Args:
+            d: a dict with optional `name`, `email` and `url` keys,
+               or a string in npm's shorthand form for people.
+
+        Returns:
+            dict: a Person holding whichever of the name, email and
+            url were found, or None if `d` cannot be interpreted.
+
+        """
+        author = {'@type': CODEMETA_URI+'Person'}
+        if isinstance(d, dict):
+            name = d.get('name', None)
+            email = d.get('email', None)
+            url = d.get('url', None)
+        elif isinstance(d, str):
+            match = self._parse_author.match(d)
+            if match is None:
+                # `.` does not match newlines, so a string spanning
+                # several lines is not a valid shorthand.
+                return None
+            name = match.group('name')
+            email = match.group('email')
+            url = match.group('url')
+        else:
+            return None
+        if name:
+            author[CODEMETA_URI+'name'] = name
+        if email:
+            author[CODEMETA_URI+'email'] = email
+        if url:
+            author[CODEMETA_URI+'url'] = url
+        return author
+

 @register_mapping
 class CodemetaMapping(JsonMapping):
     """
     dedicated class for CodeMeta (codemeta.json) mapping and translation
     """
     mapping = CROSSWALK_TABLE['codemeta-V1']
     filename = b'codemeta.json'


 def main():
     raw_content = """{"name": "test_name", "unknown_term": "ut"}"""
     raw_content1 = b"""{"name": "test_name",
                         "unknown_term": "ut",
                         "prerequisites" :"packageXYZ"}"""
     result = MAPPINGS["NpmMapping"].translate(raw_content)
     result1 = MAPPINGS["MavenMapping"].translate(raw_content1)

     print(result)
     print(result1)


 if __name__ == "__main__":
     main()
diff --git a/swh/indexer/tests/test_metadata.py b/swh/indexer/tests/test_metadata.py
index e6e5734..7e78f92 100644
--- a/swh/indexer/tests/test_metadata.py
+++ b/swh/indexer/tests/test_metadata.py
@@ -1,319 +1,332 @@
 # Copyright (C) 2017-2018 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

 import unittest
 import logging

 from swh.indexer.metadata_dictionary import CROSSWALK_TABLE, MAPPINGS
 from swh.indexer.metadata_detector import detect_metadata
 from swh.indexer.metadata_detector import extract_minimal_metadata_dict
 from swh.indexer.metadata import ContentMetadataIndexer
 from swh.indexer.metadata import RevisionMetadataIndexer
 from swh.indexer.tests.test_utils import MockObjStorage, MockStorage
 from swh.indexer.tests.test_utils import MockIndexerStorage


 class ContentMetadataTestIndexer(ContentMetadataIndexer):
     """Specific Metadata whose configuration is enough to satisfy the
        indexing tests.
     """
     def prepare(self):
         self.idx_storage = MockIndexerStorage()
         self.log = logging.getLogger('swh.indexer')
         self.objstorage = MockObjStorage()
         self.destination_task = None
         self.tools = self.register_tools(self.config['tools'])
         self.tool = self.tools[0]
         self.results = []


 class RevisionMetadataTestIndexer(RevisionMetadataIndexer):
     """Specific indexer whose configuration is enough to satisfy the
        indexing tests.
     """

     ContentMetadataIndexer = ContentMetadataTestIndexer

     def prepare(self):
         self.config = {
             'storage': {
                 'cls': 'remote',
                 'args': {
                     'url': 'http://localhost:9999',
                 }
             },
             'tools': {
                 'name': 'swh-metadata-detector',
                 'version': '0.0.2',
                 'configuration': {
                     'type': 'local',
                     'context': 'NpmMapping'
                 }
             }
         }
         self.storage = MockStorage()
         self.idx_storage = MockIndexerStorage()
         self.log = logging.getLogger('swh.indexer')
         self.objstorage = MockObjStorage()
         self.destination_task = None
         self.tools = self.register_tools(self.config['tools'])
         self.tool = self.tools[0]
         self.results = []


 class Metadata(unittest.TestCase):
     """
     Tests metadata_mock_tool tool for Metadata detection
     """
     def setUp(self):
         """
         shows the entire diff in the results
         """
         self.maxDiff = None
         self.content_tool = {
             'name': 'swh-metadata-translator',
             'version': '0.0.2',
             'configuration': {
                 'type': 'local',
                 'context': 'NpmMapping'
             }
         }
         MockIndexerStorage.added_data = []

     def test_crosstable(self):
         self.assertEqual(CROSSWALK_TABLE['NodeJS'], {
             'repository': 'https://codemeta.github.io/terms/codeRepository',
             'os': 'https://codemeta.github.io/terms/operatingSystem',
             'cpu': 'https://codemeta.github.io/terms/processorRequirements',
             'engines':
                 'https://codemeta.github.io/terms/processorRequirements',
-            'author': 'https://codemeta.github.io/terms/creator',
+            'author': 'https://codemeta.github.io/terms/author',
             'author.email': 'https://codemeta.github.io/terms/email',
             'author.name': 'https://codemeta.github.io/terms/name',
             'contributor': 'https://codemeta.github.io/terms/contributor',
             'keywords': 'https://codemeta.github.io/terms/keywords',
             'license': 'https://codemeta.github.io/terms/license',
             'version': 'https://codemeta.github.io/terms/version',
             'description': 'https://codemeta.github.io/terms/description',
             'name': 'https://codemeta.github.io/terms/name',
             'bugs': 'https://codemeta.github.io/terms/issueTracker',
             'homepage': 'https://codemeta.github.io/terms/url'
         })

     def test_compute_metadata_none(self):
         """
         testing content empty content is empty
         should return None
         """
         # given
         content = b""

         # None if no metadata was found or an error occurred
         declared_metadata = None

         # when
         result = MAPPINGS["NpmMapping"].translate(content)

         # then
         self.assertEqual(declared_metadata, result)

     def test_compute_metadata_npm(self):
         """
         testing only computation of metadata with hard_mapping_npm
         """
         # given
         content = b"""
             {
                 "name": "test_metadata",
                 "version": "0.0.2",
                 "description": "Simple package.json test for indexer",
                 "repository": {
                     "type": "git",
                     "url": "https://github.com/moranegg/metadata_test"
+                },
+                "author": {
+                    "email": "moranegg@example.com",
+                    "name": "Morane G"
                 }
             }
         """
         declared_metadata = {
             '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
             'codemeta:name': 'test_metadata',
             'codemeta:version': '0.0.2',
             'codemeta:description': 'Simple package.json test for indexer',
             'codemeta:codeRepository':
                 'git+https://github.com/moranegg/metadata_test',
+            'codemeta:author': {
+                'type': 'codemeta:Person',
+                'codemeta:name': 'Morane G',
+                'codemeta:email': 'moranegg@example.com',
+            },
         }

         # when
         result = MAPPINGS["NpmMapping"].translate(content)

         # then
         self.assertEqual(declared_metadata, result)

     def test_extract_minimal_metadata_dict(self):
         """
         Test the creation of a coherent minimal metadata set
         """
         # given
         metadata_list = [{
             '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
             'codemeta:name': 'test_1',
             'codemeta:version': '0.0.2',
             'codemeta:description': 'Simple package.json test for indexer',
             'codemeta:codeRepository':
                 'git+https://github.com/moranegg/metadata_test',
         }, {
             '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
             'codemeta:name': 'test_0_1',
             'codemeta:version': '0.0.2',
             'codemeta:description': 'Simple package.json test for indexer',
             'codemeta:codeRepository':
                 'git+https://github.com/moranegg/metadata_test'
         }, {
             '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
             'codemeta:name': 'test_metadata',
             'codemeta:version': '0.0.2',
             'codemeta:author': 'moranegg',
         }]

         # when
         results = extract_minimal_metadata_dict(metadata_list)

         # then
         expected_results = {
             '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
             "codemeta:version": '0.0.2',
             "codemeta:description": 'Simple package.json test for indexer',
             "codemeta:name": ['test_1', 'test_0_1', 'test_metadata'],
             "codemeta:author": 'moranegg',
             "codemeta:codeRepository":
                 'git+https://github.com/moranegg/metadata_test',
         }
         self.assertEqual(expected_results, results)

     def test_index_content_metadata_npm(self):
         """
         testing NPM with package.json
         - one sha1 uses a file that can't be translated to metadata and
           should return None in the translated metadata
         """
         # given
         sha1s = ['26a9f72a7c87cc9205725cfd879f514ff4f3d8d5',
                  'd4c647f0fc257591cc9ba1722484229780d1c607',
                  '02fb2c89e14f7fab46701478c83779c7beb7b069']
         # this metadata indexer computes only metadata for package.json
         # in npm context with a hard mapping
         metadata_indexer = ContentMetadataTestIndexer(
             tool=self.content_tool, config={})

         # when
         metadata_indexer.run(sha1s, policy_update='ignore-dups')
         results = metadata_indexer.idx_storage.added_data

         expected_results = [('content_metadata', False, [{
             'indexer_configuration_id': 30,
             'translated_metadata': {
                 '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
                 'codemeta:codeRepository':
                     'git+https://github.com/moranegg/metadata_test',
                 'codemeta:description':
                     'Simple package.json test for indexer',
                 'codemeta:name': 'test_metadata',
                 'codemeta:version': '0.0.1'
             },
             'id': '26a9f72a7c87cc9205725cfd879f514ff4f3d8d5'
         }, {
             'indexer_configuration_id': 30,
             'translated_metadata': {
                 '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
                 'codemeta:issueTracker':
                     'https://github.com/npm/npm/issues',
-                'codemeta:creator':
-                    'Isaac Z. Schlueter (http://blog.izs.me)',
+                'codemeta:author': {
+                    'type': 'codemeta:Person',
+                    'codemeta:name': 'Isaac Z. Schlueter',
+                    'codemeta:email': 'i@izs.me',
+                    'codemeta:url': 'http://blog.izs.me',
+                },
                 'codemeta:codeRepository':
                     'git+https://github.com/npm/npm',
                 'codemeta:description': 'a package manager for JavaScript',
                 'codemeta:license': 'Artistic-2.0',
                 'codemeta:version': '5.0.3',
                 'codemeta:name': 'npm',
                 'codemeta:keywords': [
                     'install',
                     'modules',
                     'package manager',
                     'package.json'
                 ],
                 'codemeta:url': 'https://docs.npmjs.com/'
             },
             'id': 'd4c647f0fc257591cc9ba1722484229780d1c607'
         }, {
             'indexer_configuration_id': 30,
             'translated_metadata': None,
             'id': '02fb2c89e14f7fab46701478c83779c7beb7b069'
         }])]

         # The assertion below returns False sometimes because of nested lists
         self.assertEqual(expected_results, results)

     def test_detect_metadata_package_json(self):
         # given
         df = [{
             'sha1_git': b'abc',
             'name': b'index.js',
             'target': b'abc',
             'length': 897,
             'status': 'visible',
             'type': 'file',
             'perms': 33188,
             'dir_id': b'dir_a',
             'sha1': b'bcd'
         }, {
             'sha1_git': b'aab',
             'name': b'package.json',
             'target': b'aab',
             'length': 712,
             'status': 'visible',
             'type': 'file',
             'perms': 33188,
             'dir_id': b'dir_a',
             'sha1': b'cde'
         }]
         # when
         results = detect_metadata(df)

         expected_results = {
             'NpmMapping': [
                 b'cde'
             ]
         }
         # then
         self.assertEqual(expected_results, results)

     def test_revision_metadata_indexer(self):
         metadata_indexer = RevisionMetadataTestIndexer()

         sha1_gits = [
             b'8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f',
         ]
         metadata_indexer.run(sha1_gits, 'update-dups')

         results = metadata_indexer.idx_storage.added_data

         expected_results = [('revision_metadata', True, [{
             'id': '8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f',
             'translated_metadata': {
                 '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
                 'codemeta:url':
                     'https://github.com/librariesio/yarn-parser#readme',
                 'codemeta:codeRepository':
                     'git+https://github.com/librariesio/yarn-parser.git',
                 'codemeta:author': 'Andrew Nesbitt',
                 'codemeta:license': 'AGPL-3.0',
                 'codemeta:version': '1.0.0',
                 'codemeta:description':
                     'Tiny web service for parsing yarn.lock files',
                 'codemeta:issueTracker':
                     'https://github.com/librariesio/yarn-parser/issues',
                 'codemeta:name': 'yarn-parser',
                 'codemeta:keywords':
                     ['yarn', 'parse', 'lock', 'dependencies'],
             },
             'indexer_configuration_id': 7
         }])]
         # then
         self.assertEqual(expected_results, results)