diff --git a/requirements.txt b/requirements.txt index 87ecc1f..3a7428c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,7 @@ vcversioner pygments click chardet file_magic pyld +xmltodict diff --git a/swh/indexer/codemeta.py b/swh/indexer/codemeta.py index e06744b..4548029 100644 --- a/swh/indexer/codemeta.py +++ b/swh/indexer/codemeta.py @@ -1,99 +1,120 @@ # Copyright (C) 2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import csv import json import os.path import swh.indexer from pyld import jsonld _DATA_DIR = os.path.join(os.path.dirname(swh.indexer.__file__), 'data') CROSSWALK_TABLE_PATH = os.path.join(_DATA_DIR, 'codemeta', 'crosswalk.csv') CODEMETA_CONTEXT_PATH = os.path.join(_DATA_DIR, 'codemeta', 'codemeta.jsonld') with open(CODEMETA_CONTEXT_PATH) as fd: CODEMETA_CONTEXT = json.load(fd) CODEMETA_CONTEXT_URL = 'https://doi.org/10.5063/schema/codemeta-2.0' CODEMETA_URI = 'https://codemeta.github.io/terms/' SCHEMA_URI = 'http://schema.org/' PROPERTY_BLACKLIST = { # CodeMeta properties that we cannot properly represent. 
- CODEMETA_URI + 'softwareRequirements', + SCHEMA_URI + 'softwareRequirements', CODEMETA_URI + 'softwareSuggestions', # Duplicate of 'author' - CODEMETA_URI + 'creator', + SCHEMA_URI + 'creator', } +def make_absolute_uri(local_name): + definition = CODEMETA_CONTEXT['@context'][local_name] + if isinstance(definition, str): + return definition + elif isinstance(definition, dict): + prefixed_name = definition['@id'] + (prefix, local_name) = prefixed_name.split(':') + if prefix == 'schema': + canonical_name = SCHEMA_URI + local_name + elif prefix == 'codemeta': + canonical_name = CODEMETA_URI + local_name + else: + assert False, prefix + return canonical_name + else: + assert False, definition + + def _read_crosstable(fd): reader = csv.reader(fd) try: header = next(reader) except StopIteration: raise ValueError('empty file') data_sources = set(header) - {'Parent Type', 'Property', 'Type', 'Description'} assert 'codemeta-V1' in data_sources codemeta_translation = {data_source: {} for data_source in data_sources} for line in reader: # For each canonical name - canonical_name = CODEMETA_URI + dict(zip(header, line))['Property'] + local_name = dict(zip(header, line))['Property'] + if not local_name: + continue + canonical_name = make_absolute_uri(local_name) if canonical_name in PROPERTY_BLACKLIST: continue for (col, value) in zip(header, line): # For each cell in the row if col in data_sources: # If that's not the parentType/property/type/description for local_name in value.split('/'): # For each of the data source's properties that maps # to this canonical name if local_name.strip(): codemeta_translation[col][local_name.strip()] = \ canonical_name return codemeta_translation with open(CROSSWALK_TABLE_PATH) as fd: CROSSWALK_TABLE = _read_crosstable(fd) def _document_loader(url): """Document loader for pyld. 
Reads the local codemeta.jsonld file instead of fetching it from the Internet every single time.""" if url == CODEMETA_CONTEXT_URL: return { 'contextUrl': None, 'documentUrl': url, 'document': CODEMETA_CONTEXT, } elif url == CODEMETA_URI: raise Exception('{} is CodeMeta\'s URI, use {} as context url'.format( CODEMETA_URI, CODEMETA_CONTEXT_URL)) else: raise Exception(url) def compact(doc): """Same as `pyld.jsonld.compact`, but in the context of CodeMeta.""" return jsonld.compact(doc, CODEMETA_CONTEXT_URL, options={'documentLoader': _document_loader}) def expand(doc): """Same as `pyld.jsonld.expand`, but in the context of CodeMeta.""" return jsonld.expand(doc, options={'documentLoader': _document_loader}) diff --git a/swh/indexer/metadata_detector.py b/swh/indexer/metadata_detector.py index 00bef4a..629974a 100644 --- a/swh/indexer/metadata_detector.py +++ b/swh/indexer/metadata_detector.py @@ -1,58 +1,60 @@ # Copyright (C) 2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -from swh.indexer.codemeta import compact, expand, CODEMETA_URI +from swh.indexer.codemeta import compact, expand +from swh.indexer.codemeta import make_absolute_uri from swh.indexer.metadata_dictionary import MAPPINGS def detect_metadata(files): """ Detects files potentially containing metadata Args: - file_entries (list): list of files Returns: - empty list if nothing was found - dictionary {mapping_filenames[name]:f['sha1']} """ results = {} for (mapping_name, mapping) in MAPPINGS.items(): matches = mapping.detect_metadata_files(files) if matches: results[mapping_name] = matches return results _MINIMAL_PROPERTY_SET = { "developmentStatus", "version", "operatingSystem", "description", "keywords", "issueTracker", "name", "author", "relatedLink", "url", "license", "maintainer", "email", "identifier", "codeRepository"} 
-MINIMAL_METADATA_SET = {CODEMETA_URI+prop for prop in _MINIMAL_PROPERTY_SET} +MINIMAL_METADATA_SET = {make_absolute_uri(prop) + for prop in _MINIMAL_PROPERTY_SET} def extract_minimal_metadata_dict(metadata_list): """ Every item in the metadata_list is a dict of translated_metadata in the CodeMeta vocabulary we wish to extract a minimal set of terms and keep all values corresponding to this term without duplication Args: - metadata_list (list): list of dicts of translated_metadata Returns: - minimal_dict (dict): one dict with selected values of metadata """ minimal_dict = {} for document in metadata_list: for metadata_item in expand(document): for (term, value) in metadata_item.items(): if term in MINIMAL_METADATA_SET: if term not in minimal_dict: minimal_dict[term] = [value] elif value not in minimal_dict[term]: minimal_dict[term].append(value) return compact(minimal_dict) diff --git a/swh/indexer/metadata_dictionary.py b/swh/indexer/metadata_dictionary.py index c2cd7eb..b8e01b9 100644 --- a/swh/indexer/metadata_dictionary.py +++ b/swh/indexer/metadata_dictionary.py @@ -1,230 +1,284 @@ # Copyright (C) 2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +import os import re import abc import json import logging +import xmltodict -from swh.indexer.codemeta import CROSSWALK_TABLE, CODEMETA_URI, compact +from swh.indexer.codemeta import CROSSWALK_TABLE, SCHEMA_URI +from swh.indexer.codemeta import compact, expand MAPPINGS = {} def register_mapping(cls): MAPPINGS[cls.__name__] = cls() return cls class BaseMapping(metaclass=abc.ABCMeta): """Base class for mappings to inherit from To implement a new mapping: - inherit this class - override translate function """ def __init__(self): self.log = logging.getLogger('%s.%s' % ( self.__class__.__module__, self.__class__.__name__)) @abc.abstractmethod def 
detect_metadata_files(self, files): """ Detects files potentially containing metadata Args: - file_entries (list): list of files Returns: - empty list if nothing was found - list of sha1 otherwise """ pass @abc.abstractmethod def translate(self, file_content): pass def normalize_translation(self, metadata): return compact(metadata) +class SingleFileMapping(BaseMapping): + """Base class for all mappings that use a single file as input.""" + + @property + @abc.abstractmethod + def filename(self): + """The .json file to extract metadata from.""" + pass + + def detect_metadata_files(self, file_entries): + for entry in file_entries: + if entry['name'] == self.filename: + return [entry['sha1']] + return [] + + class DictMapping(BaseMapping): """Base class for mappings that take as input a file that is mostly a key-value store (eg. a shallow JSON dict).""" @property @abc.abstractmethod def mapping(self): """A translation dict to map dict keys into a canonical name.""" pass - def translate_dict(self, content_dict): + def translate_dict(self, content_dict, *, normalize=True): """ Translates content by parsing content from a dict object and translating with the appropriate mapping Args: content_dict (dict) Returns: dict: translated metadata in json-friendly form needed for the indexer """ - translated_metadata = {} + translated_metadata = {'@type': SCHEMA_URI + 'SoftwareSourceCode'} for k, v in content_dict.items(): # First, check if there is a specific translation # method for this key translation_method = getattr(self, 'translate_' + k, None) if translation_method: translation_method(translated_metadata, v) elif k in self.mapping: # if there is no method, but the key is known from the # crosswalk table # if there is a normalization method, use it on the value normalization_method = getattr(self, 'normalize_' + k, None) if normalization_method: v = normalization_method(v) # set the translation metadata with the normalized value translated_metadata[self.mapping[k]] = v - 
return self.normalize_translation(translated_metadata) + if normalize: + return self.normalize_translation(translated_metadata) + else: + return translated_metadata -class JsonMapping(DictMapping): +class JsonMapping(DictMapping, SingleFileMapping): """Base class for all mappings that use a JSON file as input.""" - @property - @abc.abstractmethod - def filename(self): - """The .json file to extract metadata from.""" - pass - - def detect_metadata_files(self, file_entries): - for entry in file_entries: - if entry['name'] == self.filename: - return [entry['sha1']] - return [] - def translate(self, raw_content): """ Translates content by parsing content from a bytestring containing json data and translating with the appropriate mapping Args: raw_content: bytes Returns: dict: translated metadata in json-friendly form needed for the indexer """ try: raw_content = raw_content.decode() except UnicodeDecodeError: self.log.warning('Error unidecoding %r', raw_content) return try: content_dict = json.loads(raw_content) except json.JSONDecodeError: self.log.warning('Error unjsoning %r' % raw_content) return return self.translate_dict(content_dict) @register_mapping class NpmMapping(JsonMapping): """ dedicated class for NPM (package.json) mapping and translation """ mapping = CROSSWALK_TABLE['NodeJS'] filename = b'package.json' _schema_shortcuts = { 'github': 'https://github.com/', 'gist': 'https://gist.github.com/', 'bitbucket': 'https://bitbucket.org/', 'gitlab': 'https://gitlab.com/', } def normalize_repository(self, d): """https://docs.npmjs.com/files/package.json#repository""" if isinstance(d, dict): return '{type}+{url}'.format(**d) elif isinstance(d, str): if '://' in d: return d elif ':' in d: (schema, rest) = d.split(':', 1) if schema in self._schema_shortcuts: return self._schema_shortcuts[schema] + rest else: return None else: return self._schema_shortcuts['github'] + d else: return None def normalize_bugs(self, d): return '{url}'.format(**d) _parse_author = 
re.compile(r'^ *' r'(?P.*?)' r'( +<(?P.*)>)?' r'( +\((?P.*)\))?' r' *$') def normalize_author(self, d): 'https://docs.npmjs.com/files/package.json' \ '#people-fields-author-contributors' - author = {'@type': CODEMETA_URI+'Person'} + author = {'@type': SCHEMA_URI+'Person'} if isinstance(d, dict): name = d.get('name', None) email = d.get('email', None) url = d.get('url', None) elif isinstance(d, str): match = self._parse_author.match(d) name = match.group('name') email = match.group('email') url = match.group('url') else: return None if name: - author[CODEMETA_URI+'name'] = name + author[SCHEMA_URI+'name'] = name if email: - author[CODEMETA_URI+'email'] = email + author[SCHEMA_URI+'email'] = email if url: - author[CODEMETA_URI+'url'] = url + author[SCHEMA_URI+'url'] = url return author @register_mapping -class CodemetaMapping(JsonMapping): +class CodemetaMapping(SingleFileMapping): """ dedicated class for CodeMeta (codemeta.json) mapping and translation """ - mapping = CROSSWALK_TABLE['codemeta-V1'] filename = b'codemeta.json' + def translate(self, content): + return self.normalize_translation(expand(json.loads(content.decode()))) + + +@register_mapping +class MavenMapping(DictMapping, SingleFileMapping): + """ + dedicated class for Maven (pom.xml) mapping and translation + """ + filename = b'pom.xml' + mapping = CROSSWALK_TABLE['Java (Maven)'] + + def translate(self, content): + d = xmltodict.parse(content)['project'] + metadata = self.translate_dict(d, normalize=False) + metadata[SCHEMA_URI+'codeRepository'] = self.parse_repositories(d) + return self.normalize_translation(metadata) + + _default_repository = {'url': 'https://repo.maven.apache.org/maven2/'} + + def parse_repositories(self, d): + """https://maven.apache.org/pom.html#Repositories""" + if 'repositories' not in d: + return [self.parse_repository(d, self._default_repository)] + else: + repositories = d['repositories'].get('repository', []) + if not isinstance(repositories, list): + repositories = 
[repositories] + results = [] + for repo in repositories: + res = self.parse_repository(d, repo) + if res: + results.append(res) + return results + + def parse_repository(self, d, repo): + if repo.get('layout', 'default') != 'default': + return # TODO ? + url = repo['url'] + if d['groupId']: + url = os.path.join(url, *d['groupId'].split('.')) + if d['artifactId']: + url = os.path.join(url, d['artifactId']) + return url + def main(): raw_content = """{"name": "test_name", "unknown_term": "ut"}""" raw_content1 = b"""{"name": "test_name", "unknown_term": "ut", "prerequisites" :"packageXYZ"}""" result = MAPPINGS["NpmMapping"].translate(raw_content) result1 = MAPPINGS["MavenMapping"].translate(raw_content1) print(result) print(result1) if __name__ == "__main__": main() diff --git a/swh/indexer/tests/test_metadata.py b/swh/indexer/tests/test_metadata.py index 7e78f92..657f842 100644 --- a/swh/indexer/tests/test_metadata.py +++ b/swh/indexer/tests/test_metadata.py @@ -1,332 +1,480 @@ # Copyright (C) 2017-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import unittest import logging from swh.indexer.metadata_dictionary import CROSSWALK_TABLE, MAPPINGS from swh.indexer.metadata_detector import detect_metadata from swh.indexer.metadata_detector import extract_minimal_metadata_dict from swh.indexer.metadata import ContentMetadataIndexer from swh.indexer.metadata import RevisionMetadataIndexer from swh.indexer.tests.test_utils import MockObjStorage, MockStorage from swh.indexer.tests.test_utils import MockIndexerStorage class ContentMetadataTestIndexer(ContentMetadataIndexer): """Specific Metadata whose configuration is enough to satisfy the indexing tests. 
""" def prepare(self): self.idx_storage = MockIndexerStorage() self.log = logging.getLogger('swh.indexer') self.objstorage = MockObjStorage() self.destination_task = None self.tools = self.register_tools(self.config['tools']) self.tool = self.tools[0] self.results = [] class RevisionMetadataTestIndexer(RevisionMetadataIndexer): """Specific indexer whose configuration is enough to satisfy the indexing tests. """ ContentMetadataIndexer = ContentMetadataTestIndexer def prepare(self): self.config = { 'storage': { 'cls': 'remote', 'args': { 'url': 'http://localhost:9999', } }, 'tools': { 'name': 'swh-metadata-detector', 'version': '0.0.2', 'configuration': { 'type': 'local', 'context': 'NpmMapping' } } } self.storage = MockStorage() self.idx_storage = MockIndexerStorage() self.log = logging.getLogger('swh.indexer') self.objstorage = MockObjStorage() self.destination_task = None self.tools = self.register_tools(self.config['tools']) self.tool = self.tools[0] self.results = [] class Metadata(unittest.TestCase): """ Tests metadata_mock_tool tool for Metadata detection """ def setUp(self): """ shows the entire diff in the results """ self.maxDiff = None self.content_tool = { 'name': 'swh-metadata-translator', 'version': '0.0.2', 'configuration': { 'type': 'local', 'context': 'NpmMapping' } } MockIndexerStorage.added_data = [] def test_crosstable(self): self.assertEqual(CROSSWALK_TABLE['NodeJS'], { - 'repository': 'https://codemeta.github.io/terms/codeRepository', - 'os': 'https://codemeta.github.io/terms/operatingSystem', - 'cpu': 'https://codemeta.github.io/terms/processorRequirements', + 'repository': 'http://schema.org/codeRepository', + 'os': 'http://schema.org/operatingSystem', + 'cpu': 'http://schema.org/processorRequirements', 'engines': - 'https://codemeta.github.io/terms/processorRequirements', - 'author': 'https://codemeta.github.io/terms/author', - 'author.email': 'https://codemeta.github.io/terms/email', - 'author.name': 'https://codemeta.github.io/terms/name', 
- 'contributor': 'https://codemeta.github.io/terms/contributor', - 'keywords': 'https://codemeta.github.io/terms/keywords', - 'license': 'https://codemeta.github.io/terms/license', - 'version': 'https://codemeta.github.io/terms/version', - 'description': 'https://codemeta.github.io/terms/description', - 'name': 'https://codemeta.github.io/terms/name', + 'http://schema.org/processorRequirements', + 'author': 'http://schema.org/author', + 'author.email': 'http://schema.org/email', + 'author.name': 'http://schema.org/name', + 'contributor': 'http://schema.org/contributor', + 'keywords': 'http://schema.org/keywords', + 'license': 'http://schema.org/license', + 'version': 'http://schema.org/version', + 'description': 'http://schema.org/description', + 'name': 'http://schema.org/name', 'bugs': 'https://codemeta.github.io/terms/issueTracker', - 'homepage': 'https://codemeta.github.io/terms/url' + 'homepage': 'http://schema.org/url' }) def test_compute_metadata_none(self): """ testing content empty content is empty should return None """ # given content = b"" # None if no metadata was found or an error occurred declared_metadata = None # when result = MAPPINGS["NpmMapping"].translate(content) # then self.assertEqual(declared_metadata, result) def test_compute_metadata_npm(self): """ testing only computation of metadata with hard_mapping_npm """ # given content = b""" { "name": "test_metadata", "version": "0.0.2", "description": "Simple package.json test for indexer", "repository": { "type": "git", "url": "https://github.com/moranegg/metadata_test" }, "author": { "email": "moranegg@example.com", "name": "Morane G" } } """ declared_metadata = { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', - 'codemeta:name': 'test_metadata', - 'codemeta:version': '0.0.2', - 'codemeta:description': 'Simple package.json test for indexer', - 'codemeta:codeRepository': + 'type': 'SoftwareSourceCode', + 'name': 'test_metadata', + 'version': '0.0.2', + 'description': 'Simple 
package.json test for indexer', + 'schema:codeRepository': 'git+https://github.com/moranegg/metadata_test', - 'codemeta:author': { - 'type': 'codemeta:Person', - 'codemeta:name': 'Morane G', - 'codemeta:email': 'moranegg@example.com', + 'schema:author': { + 'type': 'Person', + 'name': 'Morane G', + 'email': 'moranegg@example.com', }, } # when result = MAPPINGS["NpmMapping"].translate(content) # then self.assertEqual(declared_metadata, result) def test_extract_minimal_metadata_dict(self): """ Test the creation of a coherent minimal metadata set """ # given metadata_list = [{ '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', - 'codemeta:name': 'test_1', - 'codemeta:version': '0.0.2', - 'codemeta:description': 'Simple package.json test for indexer', - 'codemeta:codeRepository': + 'name': 'test_1', + 'version': '0.0.2', + 'description': 'Simple package.json test for indexer', + 'schema:codeRepository': 'git+https://github.com/moranegg/metadata_test', }, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', - 'codemeta:name': 'test_0_1', - 'codemeta:version': '0.0.2', - 'codemeta:description': 'Simple package.json test for indexer', - 'codemeta:codeRepository': + 'name': 'test_0_1', + 'version': '0.0.2', + 'description': 'Simple package.json test for indexer', + 'schema:codeRepository': 'git+https://github.com/moranegg/metadata_test' }, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', - 'codemeta:name': 'test_metadata', - 'codemeta:version': '0.0.2', - 'codemeta:author': 'moranegg', + 'name': 'test_metadata', + 'version': '0.0.2', + 'schema:author': 'moranegg', }] # when results = extract_minimal_metadata_dict(metadata_list) # then expected_results = { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', - "codemeta:version": '0.0.2', - "codemeta:description": 'Simple package.json test for indexer', - "codemeta:name": ['test_1', 'test_0_1', 'test_metadata'], - "codemeta:author": 'moranegg', - "codemeta:codeRepository": + "version": 
'0.0.2', + "description": 'Simple package.json test for indexer', + "name": ['test_1', 'test_0_1', 'test_metadata'], + "schema:author": 'moranegg', + "schema:codeRepository": 'git+https://github.com/moranegg/metadata_test', } self.assertEqual(expected_results, results) def test_index_content_metadata_npm(self): """ testing NPM with package.json - one sha1 uses a file that can't be translated to metadata and should return None in the translated metadata """ # given sha1s = ['26a9f72a7c87cc9205725cfd879f514ff4f3d8d5', 'd4c647f0fc257591cc9ba1722484229780d1c607', '02fb2c89e14f7fab46701478c83779c7beb7b069'] # this metadata indexer computes only metadata for package.json # in npm context with a hard mapping metadata_indexer = ContentMetadataTestIndexer( tool=self.content_tool, config={}) # when metadata_indexer.run(sha1s, policy_update='ignore-dups') results = metadata_indexer.idx_storage.added_data expected_results = [('content_metadata', False, [{ 'indexer_configuration_id': 30, 'translated_metadata': { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', - 'codemeta:codeRepository': + 'type': 'SoftwareSourceCode', + 'schema:codeRepository': 'git+https://github.com/moranegg/metadata_test', - 'codemeta:description': 'Simple package.json test for indexer', - 'codemeta:name': 'test_metadata', - 'codemeta:version': '0.0.1' + 'description': 'Simple package.json test for indexer', + 'name': 'test_metadata', + 'version': '0.0.1' }, 'id': '26a9f72a7c87cc9205725cfd879f514ff4f3d8d5' }, { 'indexer_configuration_id': 30, 'translated_metadata': { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', + 'type': 'SoftwareSourceCode', 'codemeta:issueTracker': 'https://github.com/npm/npm/issues', - 'codemeta:author': { - 'type': 'codemeta:Person', - 'codemeta:name': 'Isaac Z. Schlueter', - 'codemeta:email': 'i@izs.me', - 'codemeta:url': 'http://blog.izs.me', + 'schema:author': { + 'type': 'Person', + 'name': 'Isaac Z. 
Schlueter', + 'email': 'i@izs.me', + 'schema:url': 'http://blog.izs.me', }, - 'codemeta:codeRepository': + 'schema:codeRepository': 'git+https://github.com/npm/npm', - 'codemeta:description': 'a package manager for JavaScript', - 'codemeta:license': 'Artistic-2.0', - 'codemeta:version': '5.0.3', - 'codemeta:name': 'npm', - 'codemeta:keywords': [ + 'description': 'a package manager for JavaScript', + 'schema:license': 'Artistic-2.0', + 'version': '5.0.3', + 'name': 'npm', + 'keywords': [ 'install', 'modules', 'package manager', 'package.json' ], - 'codemeta:url': 'https://docs.npmjs.com/' + 'schema:url': 'https://docs.npmjs.com/' }, 'id': 'd4c647f0fc257591cc9ba1722484229780d1c607' }, { 'indexer_configuration_id': 30, 'translated_metadata': None, 'id': '02fb2c89e14f7fab46701478c83779c7beb7b069' }])] # The assertion below returns False sometimes because of nested lists self.assertEqual(expected_results, results) def test_detect_metadata_package_json(self): # given df = [{ 'sha1_git': b'abc', 'name': b'index.js', 'target': b'abc', 'length': 897, 'status': 'visible', 'type': 'file', 'perms': 33188, 'dir_id': b'dir_a', 'sha1': b'bcd' }, { 'sha1_git': b'aab', 'name': b'package.json', 'target': b'aab', 'length': 712, 'status': 'visible', 'type': 'file', 'perms': 33188, 'dir_id': b'dir_a', 'sha1': b'cde' }] # when results = detect_metadata(df) expected_results = { 'NpmMapping': [ b'cde' ] } # then self.assertEqual(expected_results, results) + def test_compute_metadata_valid_codemeta(self): + raw_content = ( + b"""{ + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "@type": "SoftwareSourceCode", + "identifier": "CodeMeta", + "description": "CodeMeta is a concept vocabulary that can be used to standardize the exchange of software metadata across repositories and organizations.", + "name": "CodeMeta: Minimal metadata schemas for science software and code, in JSON-LD", + "codeRepository": "https://github.com/codemeta/codemeta", + "issueTracker": 
"https://github.com/codemeta/codemeta/issues", + "license": "https://spdx.org/licenses/Apache-2.0", + "version": "2.0", + "author": [ + { + "@type": "Person", + "givenName": "Carl", + "familyName": "Boettiger", + "email": "cboettig@gmail.com", + "@id": "http://orcid.org/0000-0002-1642-628X" + }, + { + "@type": "Person", + "givenName": "Matthew B.", + "familyName": "Jones", + "email": "jones@nceas.ucsb.edu", + "@id": "http://orcid.org/0000-0003-0077-4738" + } + ], + "maintainer": { + "@type": "Person", + "givenName": "Carl", + "familyName": "Boettiger", + "email": "cboettig@gmail.com", + "@id": "http://orcid.org/0000-0002-1642-628X" + }, + "contIntegration": "https://travis-ci.org/codemeta/codemeta", + "developmentStatus": "active", + "downloadUrl": "https://github.com/codemeta/codemeta/archive/2.0.zip", + "funder": { + "@id": "https://doi.org/10.13039/100000001", + "@type": "Organization", + "name": "National Science Foundation" + }, + "funding":"1549758; Codemeta: A Rosetta Stone for Metadata in Scientific Software", + "keywords": [ + "metadata", + "software" + ], + "version":"2.0", + "dateCreated":"2017-06-05", + "datePublished":"2017-06-05", + "programmingLanguage": "JSON-LD" + }""") # noqa + expected_result = { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "type": "SoftwareSourceCode", + "identifier": "CodeMeta", + "description": + "CodeMeta is a concept vocabulary that can " + "be used to standardize the exchange of software metadata " + "across repositories and organizations.", + "name": + "CodeMeta: Minimal metadata schemas for science " + "software and code, in JSON-LD", + "codeRepository": "https://github.com/codemeta/codemeta", + "issueTracker": "https://github.com/codemeta/codemeta/issues", + "license": "https://spdx.org/licenses/Apache-2.0", + "version": "2.0", + "author": [ + { + "type": "Person", + "givenName": "Carl", + "familyName": "Boettiger", + "email": "cboettig@gmail.com", + "id": "http://orcid.org/0000-0002-1642-628X" + }, + { 
+ "type": "Person", + "givenName": "Matthew B.", + "familyName": "Jones", + "email": "jones@nceas.ucsb.edu", + "id": "http://orcid.org/0000-0003-0077-4738" + } + ], + "maintainer": { + "type": "Person", + "givenName": "Carl", + "familyName": "Boettiger", + "email": "cboettig@gmail.com", + "id": "http://orcid.org/0000-0002-1642-628X" + }, + "contIntegration": "https://travis-ci.org/codemeta/codemeta", + "developmentStatus": "active", + "downloadUrl": + "https://github.com/codemeta/codemeta/archive/2.0.zip", + "funder": { + "id": "https://doi.org/10.13039/100000001", + "type": "Organization", + "name": "National Science Foundation" + }, + "funding": "1549758; Codemeta: A Rosetta Stone for Metadata " + "in Scientific Software", + "keywords": [ + "metadata", + "software" + ], + "version": "2.0", + "dateCreated": "2017-06-05", + "datePublished": "2017-06-05", + "programmingLanguage": "JSON-LD" + } + result = MAPPINGS["CodemetaMapping"].translate(raw_content) + self.assertEqual(result, expected_result) + + def test_compute_metadata_maven(self): + raw_content = b""" + + Maven Default Project + 4.0.0 + com.mycompany.app + my-app + 1.2.3 + + + central + Maven Repository Switchboard + default + http://repo1.maven.org/maven2 + + false + + + + """ + result = MAPPINGS["MavenMapping"].translate(raw_content) + self.assertEqual(result, { + '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', + 'type': 'SoftwareSourceCode', + 'name': 'Maven Default Project', + 'schema:identifier': 'com.mycompany.app', + 'version': '1.2.3', + 'schema:codeRepository': + 'http://repo1.maven.org/maven2/com/mycompany/app/my-app', + }) + def test_revision_metadata_indexer(self): metadata_indexer = RevisionMetadataTestIndexer() sha1_gits = [ b'8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f', ] metadata_indexer.run(sha1_gits, 'update-dups') results = metadata_indexer.idx_storage.added_data expected_results = [('revision_metadata', True, [{ 'id': '8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f', 
'translated_metadata': { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', - 'codemeta:url': + 'url': 'https://github.com/librariesio/yarn-parser#readme', - 'codemeta:codeRepository': + 'schema:codeRepository': 'git+https://github.com/librariesio/yarn-parser.git', - 'codemeta:author': 'Andrew Nesbitt', - 'codemeta:license': 'AGPL-3.0', - 'codemeta:version': '1.0.0', - 'codemeta:description': + 'schema:author': 'Andrew Nesbitt', + 'license': 'AGPL-3.0', + 'version': '1.0.0', + 'description': 'Tiny web service for parsing yarn.lock files', 'codemeta:issueTracker': 'https://github.com/librariesio/yarn-parser/issues', - 'codemeta:name': 'yarn-parser', - 'codemeta:keywords': ['yarn', 'parse', 'lock', 'dependencies'], + 'name': 'yarn-parser', + 'keywords': ['yarn', 'parse', 'lock', 'dependencies'], }, 'indexer_configuration_id': 7 }])] # then self.assertEqual(expected_results, results) diff --git a/swh/indexer/tests/test_origin_metadata.py b/swh/indexer/tests/test_origin_metadata.py index b5401d1..375c42e 100644 --- a/swh/indexer/tests/test_origin_metadata.py +++ b/swh/indexer/tests/test_origin_metadata.py @@ -1,127 +1,127 @@ # Copyright (C) 2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import time import logging import unittest from celery import task from swh.indexer.metadata import OriginMetadataIndexer from swh.indexer.tests.test_utils import MockObjStorage, MockStorage from swh.indexer.tests.test_utils import MockIndexerStorage from swh.indexer.tests.test_origin_head import OriginHeadTestIndexer from swh.indexer.tests.test_metadata import RevisionMetadataTestIndexer from swh.scheduler.tests.scheduler_testing import SchedulerTestFixture class OriginMetadataTestIndexer(OriginMetadataIndexer): def prepare(self): self.config = { 'storage': { 'cls': 'remote', 'args': { 'url': 
'http://localhost:9999', } }, 'tools': { 'name': 'origin-metadata', 'version': '0.0.1', 'configuration': {} } } self.storage = MockStorage() self.idx_storage = MockIndexerStorage() self.log = logging.getLogger('swh.indexer') self.objstorage = MockObjStorage() self.destination_task = None self.tools = self.register_tools(self.config['tools']) self.tool = self.tools[0] self.results = [] @task def revision_metadata_test_task(*args, **kwargs): indexer = RevisionMetadataTestIndexer() indexer.run(*args, **kwargs) return indexer.results @task def origin_intrinsic_metadata_test_task(*args, **kwargs): indexer = OriginMetadataTestIndexer() indexer.run(*args, **kwargs) return indexer.results class OriginHeadTestIndexer(OriginHeadTestIndexer): revision_metadata_task = 'revision_metadata_test_task' origin_intrinsic_metadata_task = 'origin_intrinsic_metadata_test_task' class TestOriginMetadata(SchedulerTestFixture, unittest.TestCase): def setUp(self): super().setUp() self.maxDiff = None MockIndexerStorage.added_data = [] self.add_scheduler_task_type( 'revision_metadata_test_task', 'swh.indexer.tests.test_origin_metadata.' 'revision_metadata_test_task') self.add_scheduler_task_type( 'origin_intrinsic_metadata_test_task', 'swh.indexer.tests.test_origin_metadata.' 
'origin_intrinsic_metadata_test_task') RevisionMetadataTestIndexer.scheduler = self.scheduler def tearDown(self): del RevisionMetadataTestIndexer.scheduler super().tearDown() def test_pipeline(self): indexer = OriginHeadTestIndexer() indexer.scheduler = self.scheduler indexer.run( ["git+https://github.com/librariesio/yarn-parser"], policy_update='update-dups', parse_ids=True) self.run_ready_tasks() # Run the first task time.sleep(0.1) # Give it time to complete and schedule the 2nd one self.run_ready_tasks() # Run the second task metadata = { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', - 'codemeta:url': + 'url': 'https://github.com/librariesio/yarn-parser#readme', - 'codemeta:codeRepository': + 'schema:codeRepository': 'git+https://github.com/librariesio/yarn-parser.git', - 'codemeta:author': 'Andrew Nesbitt', - 'codemeta:license': 'AGPL-3.0', - 'codemeta:version': '1.0.0', - 'codemeta:description': + 'schema:author': 'Andrew Nesbitt', + 'license': 'AGPL-3.0', + 'version': '1.0.0', + 'description': 'Tiny web service for parsing yarn.lock files', 'codemeta:issueTracker': 'https://github.com/librariesio/yarn-parser/issues', - 'codemeta:name': 'yarn-parser', - 'codemeta:keywords': ['yarn', 'parse', 'lock', 'dependencies'], + 'name': 'yarn-parser', + 'keywords': ['yarn', 'parse', 'lock', 'dependencies'], } rev_metadata = { 'id': '8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f', 'translated_metadata': metadata, 'indexer_configuration_id': 7, } origin_metadata = { 'origin_id': 54974445, 'from_revision': '8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f', 'metadata': metadata, 'indexer_configuration_id': 7, } expected_results = [ ('origin_intrinsic_metadata', True, [origin_metadata]), ('revision_metadata', True, [rev_metadata])] results = list(indexer.idx_storage.added_data) self.assertCountEqual(expected_results, results) diff --git a/swh/indexer/tests/test_utils.py b/swh/indexer/tests/test_utils.py index 3be03f7..8dc958c 100644 --- a/swh/indexer/tests/test_utils.py +++ 
# NOTE(review): this is the post-image of the diff hunk
# "b/swh/indexer/tests/test_utils.py @@ -1,399 +1,400 @@" ('+' lines kept,
# '-' lines dropped).
# Copyright (C) 2017 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

from swh.objstorage.exc import ObjNotFoundError

# Origins that MockStorage.origin_get() can return.
ORIGINS = [
    {
        'id': 52189575,
        'lister': None,
        'project': None,
        'type': 'git',
        'url': 'https://github.com/SoftwareHeritage/swh-storage'},
    {
        'id': 4423668,
        'lister': None,
        'project': None,
        'type': 'ftp',
        'url': 'rsync://ftp.gnu.org/gnu/3dldf'},
    {
        'id': 77775770,
        'lister': None,
        'project': None,
        'type': 'deposit',
        'url': 'https://forge.softwareheritage.org/source/jesuisgpl/'},
    {
        'id': 85072327,
        'lister': None,
        'project': None,
        'type': 'pypi',
        'url': 'https://pypi.org/project/limnoria/'},
    {
        'id': 49908349,
        'lister': None,
        'project': None,
        'type': 'svn',
        'url': 'http://0-512-md.googlecode.com/svn/'},
    {
        'id': 54974445,
        'lister': None,
        'project': None,
        'type': 'git',
        'url': 'https://github.com/librariesio/yarn-parser'},
]

# Snapshots returned by MockStorage.snapshot_get_latest(), keyed by the
# 'id' of the matching entry in ORIGINS.
SNAPSHOTS = {
    52189575: {
        'branches': {
            b'refs/heads/add-revision-origin-cache': {
                'target': b'L[\xce\x1c\x88\x8eF\t\xf1"\x19\x1e\xfb\xc0'
                          b's\xe7/\xe9l\x1e',
                'target_type': 'revision'},
            b'HEAD': {
                'target': b'8K\x12\x00d\x03\xcc\xe4]bS\xe3\x8f{\xd7}'
                          b'\xac\xefrm',
                'target_type': 'revision'},
            b'refs/tags/v0.0.103': {
                'target': b'\xb6"Im{\xfdLb\xb0\x94N\xea\x96m\x13x\x88+'
                          b'\x0f\xdd',
                'target_type': 'release'},
        }},
    4423668: {
        'branches': {
            b'3DLDF-1.1.4.tar.gz': {
                'target': b'dJ\xfb\x1c\x91\xf4\x82B%]6\xa2\x90|\xd3\xfc'
                          b'"G\x99\x11',
                'target_type': 'revision'},
            b'3DLDF-2.0.2.tar.gz': {
                'target': b'\xb6\x0e\xe7\x9e9\xac\xaa\x19\x9e='
                          b'\xd1\xc5\x00\\\xc6\xfc\xe0\xa6\xb4V',
                'target_type': 'revision'},
            b'3DLDF-2.0.3-examples.tar.gz': {
                'target': b'!H\x19\xc0\xee\x82-\x12F1\xbd\x97'
                          b'\xfe\xadZ\x80\x80\xc1\x83\xff',
                'target_type': 'revision'},
            b'3DLDF-2.0.3.tar.gz': {
                'target': b'\x8e\xa9\x8e/\xea}\x9feF\xf4\x9f\xfd\xee'
                          b'\xcc\x1a\xb4`\x8c\x8by',
                'target_type': 'revision'},
            b'3DLDF-2.0.tar.gz': {
                'target': b'F6*\xff(?\x19a\xef\xb6\xc2\x1fv$S\xe3G'
                          b'\xd3\xd1m',
                # Was the bytes key b'target_type'; every other branch uses
                # the str key, and bytes never compare equal to str, so a
                # 'target_type' lookup failed for this branch only.
                'target_type': 'revision'}
        }},
    77775770: {
        'branches': {
            b'master': {
                'target': b'\xe7n\xa4\x9c\x9f\xfb\xb7\xf76\x11\x08{'
                          b'\xa6\xe9\x99\xb1\x9e]q\xeb',
                'target_type': 'revision'}
        },
        'id': b"h\xc0\xd2a\x04\xd4~'\x8d\xd6\xbe\x07\xeda\xfa\xfbV"
              b"\x1d\r "},
    85072327: {
        'branches': {
            b'HEAD': {
                'target': b'releases/2018.09.09',
                'target_type': 'alias'},
            b'releases/2018.09.01': {
                'target': b'<\xee1(\xe8\x8d_\xc1\xc9\xa6rT\xf1\x1d'
                          b'\xbb\xdfF\xfdw\xcf',
                'target_type': 'revision'},
            b'releases/2018.09.09': {
                'target': b'\x83\xb9\xb6\xc7\x05\xb1%\xd0\xfem\xd8k'
                          b'A\x10\x9d\xc5\xfa2\xf8t',
                'target_type': 'revision'}},
        'id': b'{\xda\x8e\x84\x7fX\xff\x92\x80^\x93V\x18\xa3\xfay'
              b'\x12\x9e\xd6\xb3'},
    49908349: {
        'branches': {
            b'master': {
                'target': b'\xe4?r\xe1,\x88\xab\xec\xe7\x9a\x87\xb8'
                          b'\xc9\xad#.\x1bw=\x18',
                'target_type': 'revision'}},
        'id': b'\xa1\xa2\x8c\n\xb3\x87\xa8\xf9\xe0a\x8c\xb7'
              b'\x05\xea\xb8\x1f\xc4H\xf4s'},
    54974445: {
        'branches': {
            b'HEAD': {
                'target': b'8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f',
                'target_type': 'revision'}}}
}


class MockObjStorage:
    """Mock an swh-objstorage objstorage with predefined contents.

    Contents live in ``self.data``, a dict mapping a hexadecimal identifier
    (str) to the raw content (bytes).
    """
    data = {}

    def __init__(self):
        """Populate ``self.data`` with the predefined contents."""
        # NOTE(review): the whitespace inside the longer fixtures below is
        # reproduced as single spaces, exactly as it appears in the reviewed
        # text; presumably the originals were multi-line -- confirm before
        # relying on their exact bytes.
        self.data = {
            '01c9379dfc33803963d07c1ccc748d3fe4c96bb5': b'this is some text',
            '688a5ef812c53907562fe379d4b3851e69c7cb15': b'another text',
            '8986af901dd2043044ce8f0d8fc039153641cf17': b'yet another text',
            '02fb2c89e14f7fab46701478c83779c7beb7b069': (
                b" import unittest import logging"
                b" from swh.indexer.mimetype import ContentMimetypeIndexer"
                b" from swh.indexer.tests.test_utils import MockObjStorage"
                b" class MockStorage():"
                b" def content_mimetype_add(self, mimetypes):"
                b" self.state = mimetypes"
                b" self.conflict_update = conflict_update"
                b" def indexer_configuration_add(self, tools):"
                b" return [{ 'id': 10, }] "),
            '103bc087db1d26afc3a0283f38663d081e9b01e6': (
                b' #ifndef __AVL__ #define __AVL__'
                b' typedef struct _avl_tree avl_tree;'
                b' typedef struct _data_t { int content; } data_t; '),
            '93666f74f1cf635c8c8ac118879da6ec5623c410': (
                b" (should 'pygments (recognize 'lisp 'easily)) "),
            '26a9f72a7c87cc9205725cfd879f514ff4f3d8d5': (
                b' { "name": "test_metadata", "version": "0.0.1",'
                b' "description": "Simple package.json test for indexer",'
                b' "repository": { "type": "git",'
                b' "url": "https://github.com/moranegg/metadata_test" } } '),
            'd4c647f0fc257591cc9ba1722484229780d1c607': (
                b' { "version": "5.0.3", "name": "npm",'
                b' "description": "a package manager for JavaScript",'
                b' "keywords": [ "install", "modules", "package manager",'
                b' "package.json" ], "preferGlobal": true,'
                b' "config": { "publishtest": false },'
                b' "homepage": "https://docs.npmjs.com/",'
                b' "author": "Isaac Z. Schlueter (http://blog.izs.me)",'
                b' "repository": { "type": "git",'
                b' "url": "https://github.com/npm/npm" },'
                b' "bugs": { "url": "https://github.com/npm/npm/issues" },'
                b' "dependencies": { "JSONStream": "~1.3.1",'
                b' "abbrev": "~1.1.0", "ansi-regex": "~2.1.1",'
                b' "ansicolors": "~0.3.2", "ansistyles": "~0.1.3" },'
                b' "devDependencies": { "tacks": "~1.2.6",'
                b' "tap": "~10.3.2" }, "license": "Artistic-2.0" } '),
            'a7ab314d8a11d2c93e3dcf528ca294e7b431c449': b' ',
            'da39a3ee5e6b4b0d3255bfef95601890afd80709': b'',
        }

    def __iter__(self):
        """Iterate over the identifiers of the stored contents."""
        yield from self.data.keys()

    def __contains__(self, sha1):
        """Tell whether a content is stored under ``sha1``.

        An empty content (``b''``) still counts as present, because only a
        missing key makes the lookup return None.
        """
        return self.data.get(sha1) is not None

    def get(self, sha1):
        """Return the raw content stored under ``sha1``.

        Raises:
            ObjNotFoundError: if no content is stored under ``sha1``.
        """
        raw_content = self.data.get(sha1)
        if raw_content is None:
            raise ObjNotFoundError(sha1)
        return raw_content


class MockIndexerStorage():
    """Mock an swh-indexer storage.

    Every ``*_add`` call is recorded in ``added_data`` as a
    ``(kind, conflict_update, metadata)`` tuple instead of being stored.
    """
    # Deliberately a class attribute: the recordings are shared by all
    # instances, so whoever uses this mock has to rebind it to a fresh list
    # to start from a clean state.
    added_data = []

    def indexer_configuration_add(self, tools):
        """Return the canned registration of the first tool in ``tools``.

        Only ``tools[0]['tool_name']`` is looked at.

        Raises:
            AssertionError: if the tool name is not one of the known tools.
        """
        tool = tools[0]
        if tool['tool_name'] == 'swh-metadata-translator':
            return [{
                'id': 30,
                'tool_name': 'swh-metadata-translator',
                'tool_version': '0.0.1',
                'tool_configuration': {
                    'type': 'local',
                    'context': 'NpmMapping'
                },
            }]
        elif tool['tool_name'] == 'swh-metadata-detector':
            return [{
                'id': 7,
                'tool_name': 'swh-metadata-detector',
                'tool_version': '0.0.1',
                'tool_configuration': {
                    'type': 'local',
                    'context': 'NpmMapping'
                },
            }]
        elif tool['tool_name'] == 'origin-metadata':
            return [{
                'id': 8,
                'tool_name': 'origin-metadata',
                'tool_version': '0.0.1',
                'tool_configuration': {},
            }]
        else:
            # Raised explicitly (rather than `assert False`) so the mock
            # still fails loudly when Python runs with -O.
            raise AssertionError('Unknown tool {tool_name}'.format(**tool))

    def content_metadata_missing(self, sha1s):
        """Yield nothing, whatever ``sha1s`` contains."""
        yield from []

    def content_metadata_add(self, metadata, conflict_update=None):
        """Record a 'content_metadata' addition in ``added_data``."""
        self.added_data.append(
            ('content_metadata', conflict_update, metadata))

    def revision_metadata_add(self, metadata, conflict_update=None):
        """Record a 'revision_metadata' addition in ``added_data``."""
        self.added_data.append(
            ('revision_metadata', conflict_update, metadata))

    def origin_intrinsic_metadata_add(self, metadata, conflict_update=None):
        """Record an 'origin_intrinsic_metadata' addition in ``added_data``."""
        self.added_data.append(
            ('origin_intrinsic_metadata', conflict_update, metadata))

    def content_metadata_get(self, sha1s):
        """Return one canned content-metadata record; ``sha1s`` is ignored."""
        return [{
            'tool': {
                'configuration': {
                    'type': 'local',
                    'context': 'NpmMapping'
                },
                'version': '0.0.1',
                'id': 6,
                'name': 'swh-metadata-translator'
            },
            'id': b'cde',
            'translated_metadata': {
                '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
                'type': 'SoftwareSourceCode',
                'codemeta:issueTracker':
                    'https://github.com/librariesio/yarn-parser/issues',
                'version': '1.0.0',
                'name': 'yarn-parser',
                'schema:author': 'Andrew Nesbitt',
                'url':
                    'https://github.com/librariesio/yarn-parser#readme',
                'processorRequirements': {'node': '7.5'},
                'license': 'AGPL-3.0',
                'keywords': ['yarn', 'parse', 'lock', 'dependencies'],
                'schema:codeRepository':
                    'git+https://github.com/librariesio/yarn-parser.git',
                'description':
                    'Tiny web service for parsing yarn.lock files',
            }
        }]


class MockStorage():
    """Mock a real swh-storage storage to simplify reading indexers'
    outputs.

    """
    def origin_get(self, id_):
        """Return the first entry of ORIGINS matching every item of ``id_``.

        Args:
            id_: dict of field name -> expected value.

        Raises:
            AssertionError: if no origin matches.
        """
        for origin in ORIGINS:
            for (k, v) in id_.items():
                if origin[k] != v:
                    break
            else:
                # This block is run iff we didn't break, ie. if all supplied
                # parts of the id are set to the expected value.
                return origin
        # Raised explicitly (rather than `assert False`) so the mock still
        # fails loudly when Python runs with -O.
        raise AssertionError(id_)

    def snapshot_get_latest(self, origin_id):
        """Return the SNAPSHOTS entry for ``origin_id``.

        Raises:
            AssertionError: if ``origin_id`` has no entry in SNAPSHOTS.
        """
        if origin_id not in SNAPSHOTS:
            raise AssertionError(origin_id)
        return SNAPSHOTS[origin_id]

    def revision_get(self, revisions):
        """Return one canned revision; ``revisions`` is ignored."""
        return [{
            'id': b'8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f',
            'committer': {
                'id': 26,
                'name': b'Andrew Nesbitt',
                # NOTE(review): trailing space kept as found; an '<email>'
                # suffix may have been lost -- confirm.
                'fullname': b'Andrew Nesbitt ',
                'email': b'andrewnez@gmail.com'
            },
            'synthetic': False,
            'date': {
                'negative_utc': False,
                'timestamp': {
                    'seconds': 1487596456,
                    'microseconds': 0
                },
                'offset': 0
            },
            'directory': b'10'
        }]

    def directory_ls(self, directory, recursive=False, cur=None):
        """Return a canned three-entry listing; all arguments are ignored."""
        # with directory: b'\x9d',
        return [{
            'sha1_git': b'abc',
            'name': b'index.js',
            'target': b'abc',
            'length': 897,
            'status': 'visible',
            'type': 'file',
            'perms': 33188,
            'dir_id': b'10',
            'sha1': b'bcd'
        }, {
            'sha1_git': b'aab',
            'name': b'package.json',
            'target': b'aab',
            'length': 712,
            'status': 'visible',
            'type': 'file',
            'perms': 33188,
            'dir_id': b'10',
            'sha1': b'cde'
        }, {
            'dir_id': b'10',
            'target': b'11',
            'type': 'dir',
            'length': None,
            'name': b'.github',
            'sha1': None,
            'perms': 16384,
            'sha1_git': None,
            'status': None,
            'sha256': None
        }]