diff --git a/swh/indexer/codemeta.py b/swh/indexer/codemeta.py
index 756cb4f..90e863b 100644
--- a/swh/indexer/codemeta.py
+++ b/swh/indexer/codemeta.py
@@ -1,158 +1,192 @@
# Copyright (C) 2018 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import collections
import csv
import itertools
import json
import os.path
import re
import swh.indexer
from pyld import jsonld
_DATA_DIR = os.path.join(os.path.dirname(swh.indexer.__file__), 'data')
CROSSWALK_TABLE_PATH = os.path.join(_DATA_DIR, 'codemeta', 'crosswalk.csv')
CODEMETA_CONTEXT_PATH = os.path.join(_DATA_DIR, 'codemeta', 'codemeta.jsonld')
with open(CODEMETA_CONTEXT_PATH) as fd:
CODEMETA_CONTEXT = json.load(fd)
CODEMETA_CONTEXT_URL = 'https://doi.org/10.5063/schema/codemeta-2.0'
CODEMETA_ALTERNATE_CONTEXT_URLS = {
('https://raw.githubusercontent.com/codemeta/codemeta/'
'master/codemeta.jsonld')
}
CODEMETA_URI = 'https://codemeta.github.io/terms/'
SCHEMA_URI = 'http://schema.org/'
PROPERTY_BLACKLIST = {
# CodeMeta properties that we cannot properly represent.
SCHEMA_URI + 'softwareRequirements',
CODEMETA_URI + 'softwareSuggestions',
# Duplicate of 'author'
SCHEMA_URI + 'creator',
}
_codemeta_field_separator = re.compile(r'\s*[,/]\s*')
def make_absolute_uri(local_name):
definition = CODEMETA_CONTEXT['@context'][local_name]
if isinstance(definition, str):
return definition
elif isinstance(definition, dict):
prefixed_name = definition['@id']
(prefix, local_name) = prefixed_name.split(':')
if prefix == 'schema':
canonical_name = SCHEMA_URI + local_name
elif prefix == 'codemeta':
canonical_name = CODEMETA_URI + local_name
else:
assert False, prefix
return canonical_name
else:
assert False, definition
def _read_crosstable(fd):
reader = csv.reader(fd)
try:
header = next(reader)
except StopIteration:
raise ValueError('empty file')
data_sources = set(header) - {'Parent Type', 'Property',
'Type', 'Description'}
assert 'codemeta-V1' in data_sources
codemeta_translation = {data_source: {} for data_source in data_sources}
terms = set()
for line in reader: # For each canonical name
local_name = dict(zip(header, line))['Property']
if not local_name:
continue
canonical_name = make_absolute_uri(local_name)
if canonical_name in PROPERTY_BLACKLIST:
continue
terms.add(canonical_name)
for (col, value) in zip(header, line): # For each cell in the row
if col in data_sources:
# If that's not the parentType/property/type/description
for local_name in _codemeta_field_separator.split(value):
# For each of the data source's properties that maps
# to this canonical name
if local_name.strip():
codemeta_translation[col][local_name.strip()] = \
canonical_name
return (terms, codemeta_translation)
with open(CROSSWALK_TABLE_PATH) as fd:
(CODEMETA_TERMS, CROSSWALK_TABLE) = _read_crosstable(fd)
def _document_loader(url):
"""Document loader for pyld.
Reads the local codemeta.jsonld file instead of fetching it
from the Internet every single time."""
if url == CODEMETA_CONTEXT_URL or url in CODEMETA_ALTERNATE_CONTEXT_URLS:
return {
'contextUrl': None,
'documentUrl': url,
'document': CODEMETA_CONTEXT,
}
elif url == CODEMETA_URI:
raise Exception('{} is CodeMeta\'s URI, use {} as context url'.format(
CODEMETA_URI, CODEMETA_CONTEXT_URL))
else:
raise Exception(url)
def compact(doc):
"""Same as `pyld.jsonld.compact`, but in the context of CodeMeta."""
return jsonld.compact(doc, CODEMETA_CONTEXT_URL,
options={'documentLoader': _document_loader})
def expand(doc):
"""Same as `pyld.jsonld.expand`, but in the context of CodeMeta."""
return jsonld.expand(doc,
options={'documentLoader': _document_loader})
+def merge_values(v1, v2):
+    """Merge two JSON-LD property values into a single value.
+
+    If either value is `None`, the other one is returned unchanged.
+    If v1 and v2 are of the form `{"@list": l1}` and `{"@list": l2}`,
+    returns `{"@list": l1 + l2}`.
+    Otherwise, make them lists (if they are not already) and concatenate
+    them.
+
+    Raises:
+        ValueError: if v1 is a `{"@list": ...}` object and v2 is not;
+            if v2 is a dict with an `@list` key and v1 is not a
+            `{"@list": ...}` object; or if the `@list` member of either
+            object is not a list.
+
+    >>> merge_values('a', 'b')
+    ['a', 'b']
+    >>> merge_values(['a', 'b'], 'c')
+    ['a', 'b', 'c']
+    >>> merge_values({'@list': ['a', 'b']}, {'@list': ['c']})
+    {'@list': ['a', 'b', 'c']}
+    """
+    if v1 is None:
+        return v2
+    if v2 is None:
+        return v1
+
+    if isinstance(v1, dict) and set(v1) == {'@list'}:
+        if not (isinstance(v2, dict) and set(v2) == {'@list'}):
+            raise ValueError('Cannot merge %r and %r' % (v1, v2))
+        # Checked explicitly instead of with `assert`: asserts are removed
+        # under `python -O`, and `+` would then either fail with an
+        # unrelated TypeError or silently concatenate two strings.
+        if not isinstance(v1['@list'], list) \
+                or not isinstance(v2['@list'], list):
+            raise ValueError('Cannot merge %r and %r' % (v1, v2))
+        return {'@list': v1['@list'] + v2['@list']}
+
+    if isinstance(v2, dict) and '@list' in v2:
+        raise ValueError('Cannot merge %r and %r' % (v1, v2))
+    if not isinstance(v1, list):
+        v1 = [v1]
+    if not isinstance(v2, list):
+        v2 = [v2]
+    return v1 + v2
+
+
def merge_documents(documents):
"""Takes a list of metadata dicts, each generated from a different
metadata file, and merges them.
Removes duplicates, if any."""
documents = list(itertools.chain.from_iterable(map(expand, documents)))
merged_document = collections.defaultdict(list)
for document in documents:
for (key, values) in document.items():
if key == '@id':
# @id does not get expanded to a list
value = values
# Only one @id is allowed, move it to sameAs
if '@id' not in merged_document:
merged_document['@id'] = value
elif value != merged_document['@id']:
if value not in merged_document[SCHEMA_URI + 'sameAs']:
merged_document[SCHEMA_URI + 'sameAs'].append(value)
else:
for value in values:
if value not in merged_document[key]:
merged_document[key].append(value)
return compact(merged_document)
diff --git a/swh/indexer/metadata_dictionary/base.py b/swh/indexer/metadata_dictionary/base.py
index 4276dd2..e01276e 100644
--- a/swh/indexer/metadata_dictionary/base.py
+++ b/swh/indexer/metadata_dictionary/base.py
@@ -1,213 +1,179 @@
# Copyright (C) 2017-2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import abc
import json
import logging
from typing import List
from swh.indexer.codemeta import SCHEMA_URI
-from swh.indexer.codemeta import compact
-
-
-def merge_values(v1, v2):
- """If v1 and v2 are of the form `{"@list": l1}` and `{"@list": l2}`,
- returns `{"@list": l1 + l2}`.
- Otherwise, make them lists (if they are not already) and concatenate
- them.
-
- >>> merge_values('a', 'b')
- ['a', 'b']
- >>> merge_values(['a', 'b'], 'c')
- ['a', 'b', 'c']
- >>> merge_values({'@list': ['a', 'b']}, {'@list': ['c']})
- {'@list': ['a', 'b', 'c']}
- """
- if v1 is None:
- return v2
- elif v2 is None:
- return v1
- elif isinstance(v1, dict) and set(v1) == {'@list'}:
- assert isinstance(v1['@list'], list)
- if isinstance(v2, dict) and set(v2) == {'@list'}:
- assert isinstance(v2['@list'], list)
- return {'@list': v1['@list'] + v2['@list']}
- else:
- raise ValueError('Cannot merge %r and %r' % (v1, v2))
- else:
- if isinstance(v2, dict) and '@list' in v2:
- raise ValueError('Cannot merge %r and %r' % (v1, v2))
- if not isinstance(v1, list):
- v1 = [v1]
- if not isinstance(v2, list):
- v2 = [v2]
- return v1 + v2
+from swh.indexer.codemeta import compact, merge_values
class BaseMapping(metaclass=abc.ABCMeta):
"""Base class for mappings to inherit from
To implement a new mapping:
- inherit this class
- override translate function
"""
def __init__(self, log_suffix=''):
self.log_suffix = log_suffix
self.log = logging.getLogger('%s.%s' % (
self.__class__.__module__,
self.__class__.__name__))
@property
@abc.abstractmethod
def name(self):
"""A name of this mapping, used as an identifier in the
indexer storage."""
pass
@classmethod
@abc.abstractmethod
def detect_metadata_files(cls, files):
"""
Detects files potentially containing metadata
Args:
file_entries (list): list of files
Returns:
list: list of sha1 (possibly empty)
"""
pass
@abc.abstractmethod
def translate(self, file_content):
pass
def normalize_translation(self, metadata):
return compact(metadata)
class SingleFileMapping(BaseMapping):
"""Base class for all mappings that use a single file as input."""
@property
@abc.abstractmethod
def filename(self):
"""The .json file to extract metadata from."""
pass
@classmethod
def detect_metadata_files(cls, file_entries):
for entry in file_entries:
if entry['name'] == cls.filename:
return [entry['sha1']]
return []
class DictMapping(BaseMapping):
"""Base class for mappings that take as input a file that is mostly
a key-value store (eg. a shallow JSON dict)."""
string_fields = [] # type: List[str]
'''List of fields that are simple strings, and don't need any
normalization.'''
@property
@abc.abstractmethod
def mapping(self):
"""A translation dict to map dict keys into a canonical name."""
pass
@staticmethod
def _normalize_method_name(name):
return name.replace('-', '_')
@classmethod
def supported_terms(cls):
return {
term for (key, term) in cls.mapping.items()
if key in cls.string_fields
or hasattr(cls, 'translate_' + cls._normalize_method_name(key))
or hasattr(cls, 'normalize_' + cls._normalize_method_name(key))}
def _translate_dict(self, content_dict, *, normalize=True):
"""
Translates content by parsing content from a dict object
and translating with the appropriate mapping
Args:
content_dict (dict): content dict to translate
Returns:
dict: translated metadata in json-friendly form needed for
the indexer
"""
translated_metadata = {'@type': SCHEMA_URI + 'SoftwareSourceCode'}
for k, v in content_dict.items():
# First, check if there is a specific translation
# method for this key
translation_method = getattr(
self, 'translate_' + self._normalize_method_name(k), None)
if translation_method:
translation_method(translated_metadata, v)
elif k in self.mapping:
# if there is no method, but the key is known from the
# crosswalk table
codemeta_key = self.mapping[k]
# if there is a normalization method, use it on the value
normalization_method = getattr(
self, 'normalize_' + self._normalize_method_name(k), None)
if normalization_method:
v = normalization_method(v)
elif k in self.string_fields and isinstance(v, str):
pass
elif k in self.string_fields and isinstance(v, list):
v = [x for x in v if isinstance(x, str)]
else:
continue
# set the translation metadata with the normalized value
if codemeta_key in translated_metadata:
translated_metadata[codemeta_key] = merge_values(
translated_metadata[codemeta_key], v)
else:
translated_metadata[codemeta_key] = v
if normalize:
return self.normalize_translation(translated_metadata)
else:
return translated_metadata
class JsonMapping(DictMapping, SingleFileMapping):
"""Base class for all mappings that use a JSON file as input."""
def translate(self, raw_content):
"""
Translates content by parsing content from a bytestring containing
json data and translating with the appropriate mapping
Args:
raw_content (bytes): raw content to translate
Returns:
dict: translated metadata in json-friendly form needed for
the indexer
"""
try:
raw_content = raw_content.decode()
except UnicodeDecodeError:
self.log.warning('Error unidecoding from %s', self.log_suffix)
return
try:
content_dict = json.loads(raw_content)
except json.JSONDecodeError:
self.log.warning('Error unjsoning from %s', self.log_suffix)
return
if isinstance(content_dict, dict):
return self._translate_dict(content_dict)
diff --git a/swh/indexer/tests/test_codemeta.py b/swh/indexer/tests/test_codemeta.py
new file mode 100644
index 0000000..e5ba00e
--- /dev/null
+++ b/swh/indexer/tests/test_codemeta.py
@@ -0,0 +1,158 @@
+# Copyright (C) 2018-2020 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import pytest
+
+from swh.indexer.codemeta import merge_documents, merge_values
+from swh.indexer.codemeta import CROSSWALK_TABLE
+
+
+def test_crosstable():
+    """The 'NodeJS' column of the crosswalk table must be exactly the
+    mapping below, from package.json field names to canonical URIs."""
+    expected_nodejs_mapping = {
+        'repository': 'http://schema.org/codeRepository',
+        'os': 'http://schema.org/operatingSystem',
+        'cpu': 'http://schema.org/processorRequirements',
+        'engines':
+            'http://schema.org/processorRequirements',
+        'author': 'http://schema.org/author',
+        'author.email': 'http://schema.org/email',
+        'author.name': 'http://schema.org/name',
+        'contributor': 'http://schema.org/contributor',
+        'keywords': 'http://schema.org/keywords',
+        'license': 'http://schema.org/license',
+        'version': 'http://schema.org/version',
+        'description': 'http://schema.org/description',
+        'name': 'http://schema.org/name',
+        'bugs': 'https://codemeta.github.io/terms/issueTracker',
+        'homepage': 'http://schema.org/url'
+    }
+
+    nodejs_mapping = CROSSWALK_TABLE['NodeJS']
+    assert nodejs_mapping == expected_nodejs_mapping
+
+
+def test_merge_values():
+    """Exercise `merge_values` on plain values, `@list` objects,
+    `None` operands, and combinations that must be rejected."""
+    # (v1, v2, expected result) for every pair that can be merged
+    mergeable = [
+        ('a', 'b', ['a', 'b']),
+        (['a', 'b'], 'c', ['a', 'b', 'c']),
+        ('a', ['b', 'c'], ['a', 'b', 'c']),
+        ({'@list': ['a']}, {'@list': ['b']},
+         {'@list': ['a', 'b']}),
+        ({'@list': ['a', 'b']}, {'@list': ['c']},
+         {'@list': ['a', 'b', 'c']}),
+        # None on either side returns the other operand untouched
+        ('a', None, 'a'),
+        (['a', 'b'], None, ['a', 'b']),
+        (None, ['b', 'c'], ['b', 'c']),
+        ({'@list': ['a']}, None, {'@list': ['a']}),
+        (None, {'@list': ['a']}, {'@list': ['a']}),
+    ]
+    for (left, right, expected) in mergeable:
+        assert merge_values(left, right) == expected
+
+    # An `@list` object cannot be merged with a plain value or list
+    unmergeable = [
+        ({'@list': ['a']}, 'b'),
+        ('a', {'@list': ['b']}),
+        ({'@list': ['a']}, ['b']),
+        (['a'], {'@list': ['b']}),
+    ]
+    for (left, right) in unmergeable:
+        with pytest.raises(ValueError):
+            merge_values(left, right)
+
+
+def test_merge_documents():
+    """
+    Merging three documents yields one document in which values shared
+    by several inputs appear once and differing names are all kept.
+    """
+    # given
+    first_document = {
+        '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
+        'name': 'test_1',
+        'version': '0.0.2',
+        'description': 'Simple package.json test for indexer',
+        'codeRepository':
+            'git+https://github.com/moranegg/metadata_test',
+    }
+    second_document = {
+        '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
+        'name': 'test_0_1',
+        'version': '0.0.2',
+        'description': 'Simple package.json test for indexer',
+        'codeRepository':
+            'git+https://github.com/moranegg/metadata_test'
+    }
+    third_document = {
+        '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
+        'name': 'test_metadata',
+        'version': '0.0.2',
+        'author': {
+            'type': 'Person',
+            'name': 'moranegg',
+        },
+    }
+
+    # when
+    merged = merge_documents(
+        [first_document, second_document, third_document])
+
+    # then
+    expected = {
+        '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
+        "version": '0.0.2',
+        "description": 'Simple package.json test for indexer',
+        "name": ['test_1', 'test_0_1', 'test_metadata'],
+        "author": [{
+            'type': 'Person',
+            'name': 'moranegg'
+        }],
+        "codeRepository":
+            'git+https://github.com/moranegg/metadata_test',
+    }
+    assert merged == expected
+
+
+def test_merge_documents_ids():
+    """When two documents carry different ids, the first id is kept
+    as `id` and the second one ends up in `schema:sameAs`."""
+    # given
+    first_document = {
+        '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
+        'id': 'http://example.org/test1',
+        'name': 'test_1',
+    }
+    second_document = {
+        '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
+        'id': 'http://example.org/test2',
+        'name': 'test_2',
+    }
+
+    # when
+    merged = merge_documents([first_document, second_document])
+
+    # then
+    expected = {
+        '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
+        'id': 'http://example.org/test1',
+        'schema:sameAs': 'http://example.org/test2',
+        "name": ['test_1', 'test_2']
+    }
+    assert merged == expected
+
+
+def test_merge_documents_duplicate_ids():
+    """An id repeated across documents is not copied to
+    `schema:sameAs`; only the id that differs from the first is."""
+    # given
+    first_document = {
+        '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
+        'id': 'http://example.org/test1',
+        'name': 'test_1',
+    }
+    same_id_document = {
+        '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
+        'id': 'http://example.org/test1',
+        'name': 'test_1b',
+    }
+    other_id_document = {
+        '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
+        'id': 'http://example.org/test2',
+        'name': 'test_2',
+    }
+
+    # when
+    merged = merge_documents(
+        [first_document, same_id_document, other_id_document])
+
+    # then
+    expected = {
+        '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
+        'id': 'http://example.org/test1',
+        'schema:sameAs': 'http://example.org/test2',
+        "name": ['test_1', 'test_1b', 'test_2']
+    }
+    assert merged == expected
diff --git a/swh/indexer/tests/test_metadata.py b/swh/indexer/tests/test_metadata.py
index 3586cc3..02981cc 100644
--- a/swh/indexer/tests/test_metadata.py
+++ b/swh/indexer/tests/test_metadata.py
@@ -1,1287 +1,1123 @@
# Copyright (C) 2017-2018 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import json
import unittest
import attr
from hypothesis import given, strategies, settings, HealthCheck
from swh.model.hashutil import hash_to_bytes
-from swh.indexer.codemeta import CODEMETA_TERMS, CROSSWALK_TABLE
-from swh.indexer.codemeta import merge_documents
+from swh.indexer.codemeta import CODEMETA_TERMS
from swh.indexer.metadata_dictionary import MAPPINGS
-from swh.indexer.metadata_dictionary.base import merge_values
from swh.indexer.metadata_dictionary.maven import MavenMapping
from swh.indexer.metadata_dictionary.npm import NpmMapping
from swh.indexer.metadata_dictionary.ruby import GemspecMapping
from swh.indexer.metadata_detector import (
detect_metadata
)
from swh.indexer.metadata import (
ContentMetadataIndexer, RevisionMetadataIndexer
)
from .utils import (
BASE_TEST_CONFIG, fill_obj_storage, fill_storage,
YARN_PARSER_METADATA, json_document_strategy,
xml_document_strategy,
)
TRANSLATOR_TOOL = {
'name': 'swh-metadata-translator',
'version': '0.0.2',
'configuration': {
'type': 'local',
'context': 'NpmMapping'
}
}
class ContentMetadataTestIndexer(ContentMetadataIndexer):
"""Specific Metadata whose configuration is enough to satisfy the
indexing tests.
"""
def parse_config_file(self, *args, **kwargs):
assert False, 'should not be called; the rev indexer configures it.'
REVISION_METADATA_CONFIG = {
**BASE_TEST_CONFIG,
'tools': TRANSLATOR_TOOL,
}
class Metadata(unittest.TestCase):
"""
Tests metadata_mock_tool tool for Metadata detection
"""
def setUp(self):
"""
shows the entire diff in the results
"""
self.maxDiff = None
self.npm_mapping = MAPPINGS['NpmMapping']()
self.codemeta_mapping = MAPPINGS['CodemetaMapping']()
self.maven_mapping = MAPPINGS['MavenMapping']()
self.pkginfo_mapping = MAPPINGS['PythonPkginfoMapping']()
self.gemspec_mapping = MAPPINGS['GemspecMapping']()
- def test_crosstable(self):
- self.assertEqual(CROSSWALK_TABLE['NodeJS'], {
- 'repository': 'http://schema.org/codeRepository',
- 'os': 'http://schema.org/operatingSystem',
- 'cpu': 'http://schema.org/processorRequirements',
- 'engines':
- 'http://schema.org/processorRequirements',
- 'author': 'http://schema.org/author',
- 'author.email': 'http://schema.org/email',
- 'author.name': 'http://schema.org/name',
- 'contributor': 'http://schema.org/contributor',
- 'keywords': 'http://schema.org/keywords',
- 'license': 'http://schema.org/license',
- 'version': 'http://schema.org/version',
- 'description': 'http://schema.org/description',
- 'name': 'http://schema.org/name',
- 'bugs': 'https://codemeta.github.io/terms/issueTracker',
- 'homepage': 'http://schema.org/url'
- })
-
- def test_merge_values(self):
- self.assertEqual(
- merge_values('a', 'b'),
- ['a', 'b'])
- self.assertEqual(
- merge_values(['a', 'b'], 'c'),
- ['a', 'b', 'c'])
- self.assertEqual(
- merge_values('a', ['b', 'c']),
- ['a', 'b', 'c'])
-
- self.assertEqual(
- merge_values({'@list': ['a']}, {'@list': ['b']}),
- {'@list': ['a', 'b']})
- self.assertEqual(
- merge_values({'@list': ['a', 'b']}, {'@list': ['c']}),
- {'@list': ['a', 'b', 'c']})
-
- with self.assertRaises(ValueError):
- merge_values({'@list': ['a']}, 'b')
- with self.assertRaises(ValueError):
- merge_values('a', {'@list': ['b']})
- with self.assertRaises(ValueError):
- merge_values({'@list': ['a']}, ['b'])
- with self.assertRaises(ValueError):
- merge_values(['a'], {'@list': ['b']})
-
- self.assertEqual(
- merge_values('a', None),
- 'a')
- self.assertEqual(
- merge_values(['a', 'b'], None),
- ['a', 'b'])
- self.assertEqual(
- merge_values(None, ['b', 'c']),
- ['b', 'c'])
- self.assertEqual(
- merge_values({'@list': ['a']}, None),
- {'@list': ['a']})
- self.assertEqual(
- merge_values(None, {'@list': ['a']}),
- {'@list': ['a']})
-
def test_compute_metadata_none(self):
"""
testing content empty content is empty
should return None
"""
# given
content = b""
# None if no metadata was found or an error occurred
declared_metadata = None
# when
result = self.npm_mapping.translate(content)
# then
self.assertEqual(declared_metadata, result)
def test_compute_metadata_npm(self):
"""
testing only computation of metadata with hard_mapping_npm
"""
# given
content = b"""
{
"name": "test_metadata",
"version": "0.0.2",
"description": "Simple package.json test for indexer",
"repository": {
"type": "git",
"url": "https://github.com/moranegg/metadata_test"
},
"author": {
"email": "moranegg@example.com",
"name": "Morane G"
}
}
"""
declared_metadata = {
'@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
'type': 'SoftwareSourceCode',
'name': 'test_metadata',
'version': '0.0.2',
'description': 'Simple package.json test for indexer',
'codeRepository':
'git+https://github.com/moranegg/metadata_test',
'author': [{
'type': 'Person',
'name': 'Morane G',
'email': 'moranegg@example.com',
}],
}
# when
result = self.npm_mapping.translate(content)
# then
self.assertEqual(declared_metadata, result)
- def test_merge_documents(self):
- """
- Test the creation of a coherent minimal metadata set
- """
- # given
- metadata_list = [{
- '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
- 'name': 'test_1',
- 'version': '0.0.2',
- 'description': 'Simple package.json test for indexer',
- 'codeRepository':
- 'git+https://github.com/moranegg/metadata_test',
- }, {
- '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
- 'name': 'test_0_1',
- 'version': '0.0.2',
- 'description': 'Simple package.json test for indexer',
- 'codeRepository':
- 'git+https://github.com/moranegg/metadata_test'
- }, {
- '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
- 'name': 'test_metadata',
- 'version': '0.0.2',
- 'author': {
- 'type': 'Person',
- 'name': 'moranegg',
- },
- }]
-
- # when
- results = merge_documents(metadata_list)
-
- # then
- expected_results = {
- '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
- "version": '0.0.2',
- "description": 'Simple package.json test for indexer',
- "name": ['test_1', 'test_0_1', 'test_metadata'],
- "author": [{
- 'type': 'Person',
- 'name': 'moranegg'
- }],
- "codeRepository":
- 'git+https://github.com/moranegg/metadata_test',
- }
- self.assertEqual(expected_results, results)
-
- def test_merge_documents_ids(self):
- # given
- metadata_list = [{
- '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
- 'id': 'http://example.org/test1',
- 'name': 'test_1',
- }, {
- '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
- 'id': 'http://example.org/test2',
- 'name': 'test_2',
- }]
-
- # when
- results = merge_documents(metadata_list)
-
- # then
- expected_results = {
- '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
- 'id': 'http://example.org/test1',
- 'schema:sameAs': 'http://example.org/test2',
- "name": ['test_1', 'test_2']
- }
- self.assertEqual(expected_results, results)
-
- def test_merge_documents_duplicate_ids(self):
- # given
- metadata_list = [{
- '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
- 'id': 'http://example.org/test1',
- 'name': 'test_1',
- }, {
- '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
- 'id': 'http://example.org/test1',
- 'name': 'test_1b',
- }, {
- '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
- 'id': 'http://example.org/test2',
- 'name': 'test_2',
- }]
-
- # when
- results = merge_documents(metadata_list)
-
- # then
- expected_results = {
- '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
- 'id': 'http://example.org/test1',
- 'schema:sameAs': 'http://example.org/test2',
- "name": ['test_1', 'test_1b', 'test_2']
- }
- self.assertEqual(expected_results, results)
-
def test_index_content_metadata_npm(self):
"""
testing NPM with package.json
- one sha1 uses a file that can't be translated to metadata and
should return None in the translated metadata
"""
# given
sha1s = [
hash_to_bytes('26a9f72a7c87cc9205725cfd879f514ff4f3d8d5'),
hash_to_bytes('d4c647f0fc257591cc9ba1722484229780d1c607'),
hash_to_bytes('02fb2c89e14f7fab46701478c83779c7beb7b069'),
]
# this metadata indexer computes only metadata for package.json
# in npm context with a hard mapping
config = BASE_TEST_CONFIG.copy()
config['tools'] = [TRANSLATOR_TOOL]
metadata_indexer = ContentMetadataTestIndexer(config=config)
fill_obj_storage(metadata_indexer.objstorage)
fill_storage(metadata_indexer.storage)
# when
metadata_indexer.run(sha1s, policy_update='ignore-dups')
results = list(metadata_indexer.idx_storage.content_metadata_get(
sha1s))
expected_results = [{
'metadata': {
'@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
'type': 'SoftwareSourceCode',
'codeRepository':
'git+https://github.com/moranegg/metadata_test',
'description': 'Simple package.json test for indexer',
'name': 'test_metadata',
'version': '0.0.1'
},
'id': hash_to_bytes('26a9f72a7c87cc9205725cfd879f514ff4f3d8d5'),
}, {
'metadata': {
'@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
'type': 'SoftwareSourceCode',
'issueTracker':
'https://github.com/npm/npm/issues',
'author': [{
'type': 'Person',
'name': 'Isaac Z. Schlueter',
'email': 'i@izs.me',
'url': 'http://blog.izs.me',
}],
'codeRepository':
'git+https://github.com/npm/npm',
'description': 'a package manager for JavaScript',
'license': 'https://spdx.org/licenses/Artistic-2.0',
'version': '5.0.3',
'name': 'npm',
'keywords': [
'install',
'modules',
'package manager',
'package.json'
],
'url': 'https://docs.npmjs.com/'
},
'id': hash_to_bytes('d4c647f0fc257591cc9ba1722484229780d1c607')
}]
for result in results:
del result['tool']
# The assertion below returns False sometimes because of nested lists
self.assertEqual(expected_results, results)
def test_npm_bugs_normalization(self):
# valid dictionary
package_json = b"""{
"name": "foo",
"bugs": {
"url": "https://github.com/owner/project/issues",
"email": "foo@example.com"
}
}"""
result = self.npm_mapping.translate(package_json)
self.assertEqual(result, {
'@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
'name': 'foo',
'issueTracker': 'https://github.com/owner/project/issues',
'type': 'SoftwareSourceCode',
})
# "invalid" dictionary
package_json = b"""{
"name": "foo",
"bugs": {
"email": "foo@example.com"
}
}"""
result = self.npm_mapping.translate(package_json)
self.assertEqual(result, {
'@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
'name': 'foo',
'type': 'SoftwareSourceCode',
})
# string
package_json = b"""{
"name": "foo",
"bugs": "https://github.com/owner/project/issues"
}"""
result = self.npm_mapping.translate(package_json)
self.assertEqual(result, {
'@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
'name': 'foo',
'issueTracker': 'https://github.com/owner/project/issues',
'type': 'SoftwareSourceCode',
})
def test_npm_repository_normalization(self):
# normal
package_json = b"""{
"name": "foo",
"repository": {
"type" : "git",
"url" : "https://github.com/npm/cli.git"
}
}"""
result = self.npm_mapping.translate(package_json)
self.assertEqual(result, {
'@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
'name': 'foo',
'codeRepository': 'git+https://github.com/npm/cli.git',
'type': 'SoftwareSourceCode',
})
# missing url
package_json = b"""{
"name": "foo",
"repository": {
"type" : "git"
}
}"""
result = self.npm_mapping.translate(package_json)
self.assertEqual(result, {
'@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
'name': 'foo',
'type': 'SoftwareSourceCode',
})
# github shortcut
package_json = b"""{
"name": "foo",
"repository": "github:npm/cli"
}"""
result = self.npm_mapping.translate(package_json)
expected_result = {
'@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
'name': 'foo',
'codeRepository': 'git+https://github.com/npm/cli.git',
'type': 'SoftwareSourceCode',
}
self.assertEqual(result, expected_result)
# github shortshortcut
package_json = b"""{
"name": "foo",
"repository": "npm/cli"
}"""
result = self.npm_mapping.translate(package_json)
self.assertEqual(result, expected_result)
# gitlab shortcut
package_json = b"""{
"name": "foo",
"repository": "gitlab:user/repo"
}"""
result = self.npm_mapping.translate(package_json)
self.assertEqual(result, {
'@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
'name': 'foo',
'codeRepository': 'git+https://gitlab.com/user/repo.git',
'type': 'SoftwareSourceCode',
})
def test_detect_metadata_package_json(self):
# given
df = [{
'sha1_git': b'abc',
'name': b'index.js',
'target': b'abc',
'length': 897,
'status': 'visible',
'type': 'file',
'perms': 33188,
'dir_id': b'dir_a',
'sha1': b'bcd'
},
{
'sha1_git': b'aab',
'name': b'package.json',
'target': b'aab',
'length': 712,
'status': 'visible',
'type': 'file',
'perms': 33188,
'dir_id': b'dir_a',
'sha1': b'cde'
}]
# when
results = detect_metadata(df)
expected_results = {
'NpmMapping': [
b'cde'
]
}
# then
self.assertEqual(expected_results, results)
def test_compute_metadata_valid_codemeta(self):
raw_content = (
b"""{
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"@type": "SoftwareSourceCode",
"identifier": "CodeMeta",
"description": "CodeMeta is a concept vocabulary that can be used to standardize the exchange of software metadata across repositories and organizations.",
"name": "CodeMeta: Minimal metadata schemas for science software and code, in JSON-LD",
"codeRepository": "https://github.com/codemeta/codemeta",
"issueTracker": "https://github.com/codemeta/codemeta/issues",
"license": "https://spdx.org/licenses/Apache-2.0",
"version": "2.0",
"author": [
{
"@type": "Person",
"givenName": "Carl",
"familyName": "Boettiger",
"email": "cboettig@gmail.com",
"@id": "http://orcid.org/0000-0002-1642-628X"
},
{
"@type": "Person",
"givenName": "Matthew B.",
"familyName": "Jones",
"email": "jones@nceas.ucsb.edu",
"@id": "http://orcid.org/0000-0003-0077-4738"
}
],
"maintainer": {
"@type": "Person",
"givenName": "Carl",
"familyName": "Boettiger",
"email": "cboettig@gmail.com",
"@id": "http://orcid.org/0000-0002-1642-628X"
},
"contIntegration": "https://travis-ci.org/codemeta/codemeta",
"developmentStatus": "active",
"downloadUrl": "https://github.com/codemeta/codemeta/archive/2.0.zip",
"funder": {
"@id": "https://doi.org/10.13039/100000001",
"@type": "Organization",
"name": "National Science Foundation"
},
"funding":"1549758; Codemeta: A Rosetta Stone for Metadata in Scientific Software",
"keywords": [
"metadata",
"software"
],
"version":"2.0",
"dateCreated":"2017-06-05",
"datePublished":"2017-06-05",
"programmingLanguage": "JSON-LD"
}""") # noqa
expected_result = {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
"identifier": "CodeMeta",
"description":
"CodeMeta is a concept vocabulary that can "
"be used to standardize the exchange of software metadata "
"across repositories and organizations.",
"name":
"CodeMeta: Minimal metadata schemas for science "
"software and code, in JSON-LD",
"codeRepository": "https://github.com/codemeta/codemeta",
"issueTracker": "https://github.com/codemeta/codemeta/issues",
"license": "https://spdx.org/licenses/Apache-2.0",
"version": "2.0",
"author": [
{
"type": "Person",
"givenName": "Carl",
"familyName": "Boettiger",
"email": "cboettig@gmail.com",
"id": "http://orcid.org/0000-0002-1642-628X"
},
{
"type": "Person",
"givenName": "Matthew B.",
"familyName": "Jones",
"email": "jones@nceas.ucsb.edu",
"id": "http://orcid.org/0000-0003-0077-4738"
}
],
"maintainer": {
"type": "Person",
"givenName": "Carl",
"familyName": "Boettiger",
"email": "cboettig@gmail.com",
"id": "http://orcid.org/0000-0002-1642-628X"
},
"contIntegration": "https://travis-ci.org/codemeta/codemeta",
"developmentStatus": "active",
"downloadUrl":
"https://github.com/codemeta/codemeta/archive/2.0.zip",
"funder": {
"id": "https://doi.org/10.13039/100000001",
"type": "Organization",
"name": "National Science Foundation"
},
"funding": "1549758; Codemeta: A Rosetta Stone for Metadata "
"in Scientific Software",
"keywords": [
"metadata",
"software"
],
"version": "2.0",
"dateCreated": "2017-06-05",
"datePublished": "2017-06-05",
"programmingLanguage": "JSON-LD"
}
result = self.codemeta_mapping.translate(raw_content)
self.assertEqual(result, expected_result)
def test_compute_metadata_codemeta_alternate_context(self):
    '''Checks a CodeMeta document that declares its context through the
    raw.githubusercontent.com URL.

    The expected translation carries the doi.org context URL instead.
    '''
    document = (
        b"""{
"@context": "https://raw.githubusercontent.com/codemeta/codemeta/master/codemeta.jsonld",
"@type": "SoftwareSourceCode",
"identifier": "CodeMeta"
}""")  # noqa
    expected = {
        "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
        "type": "SoftwareSourceCode",
        "identifier": "CodeMeta",
    }
    self.assertEqual(self.codemeta_mapping.translate(document), expected)
def test_compute_metadata_maven(self):
    '''Checks the Maven translation of a fixture that lists one
    repository URL and one license URL.'''
    # NOTE(review): the fixture below shows no XML markup; confirm it
    # matches the intended test data.
    pom = b"""
Maven Default Project
4.0.0
com.mycompany.app
my-app
1.2.3
central
Maven Repository Switchboard
default
http://repo1.maven.org/maven2
false
Apache License, Version 2.0
https://www.apache.org/licenses/LICENSE-2.0.txt
repo
A business-friendly OSS license
"""
    expected = {
        '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
        'type': 'SoftwareSourceCode',
        'name': 'Maven Default Project',
        'identifier': 'com.mycompany.app',
        'version': '1.2.3',
        'license': 'https://www.apache.org/licenses/LICENSE-2.0.txt',
        'codeRepository':
            'http://repo1.maven.org/maven2/com/mycompany/app/my-app',
    }
    self.assertEqual(self.maven_mapping.translate(pom), expected)
def test_compute_metadata_maven_empty(self):
    '''Expects only the context and type entries for this fixture.'''
    # NOTE(review): the fixture below shows no XML markup; confirm it
    # matches the intended test data.
    pom = b"""
"""
    expected = {
        '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
        'type': 'SoftwareSourceCode',
    }
    self.assertEqual(self.maven_mapping.translate(pom), expected)
def test_compute_metadata_maven_almost_empty(self):
    '''Expects only the context and type entries for this fixture.'''
    # NOTE(review): the fixture below shows no XML markup; confirm it
    # matches the intended test data.
    pom = b"""
"""
    expected = {
        '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
        'type': 'SoftwareSourceCode',
    }
    self.assertEqual(self.maven_mapping.translate(pom), expected)
def test_compute_metadata_maven_invalid_xml(self):
    '''Expects a single "Error parsing XML" warning and a None result
    for each of the two fixtures.'''
    warning = (
        'WARNING:swh.indexer.metadata_dictionary.maven.MavenMapping:'
        'Error parsing XML from foo')
    # NOTE(review): both fixtures below show no XML markup; confirm
    # they match the intended test data.
    first_document = b"""
"""
    second_document = b"""
"""
    for document in (first_document, second_document):
        # The mapping is built inside the context manager so that
        # anything it logs is captured as well.
        with self.assertLogs('swh.indexer.metadata_dictionary',
                             level='WARNING') as logs:
            mapping = MAPPINGS["MavenMapping"]('foo')
            translated = mapping.translate(document)
        self.assertEqual(logs.output, [warning])
        self.assertEqual(translated, None)
def test_compute_metadata_maven_unknown_encoding(self):
    '''Expects a single "Error detecting XML encoding" warning and a
    None result for each of the two fixtures.'''
    warning = (
        'WARNING:swh.indexer.metadata_dictionary.maven.MavenMapping:'
        'Error detecting XML encoding from foo')
    # NOTE(review): both fixtures below show no XML markup; confirm
    # they match the intended test data.
    first_document = b"""
"""
    second_document = b"""
"""
    for document in (first_document, second_document):
        # The mapping is built inside the context manager so that
        # anything it logs is captured as well.
        with self.assertLogs('swh.indexer.metadata_dictionary',
                             level='WARNING') as logs:
            mapping = MAPPINGS["MavenMapping"]('foo')
            translated = mapping.translate(document)
        self.assertEqual(logs.output, [warning])
        self.assertEqual(translated, None)
def test_compute_metadata_maven_invalid_encoding(self):
    '''Expects a single "Error unidecoding XML" warning and a None
    result for this fixture.'''
    warning = (
        'WARNING:swh.indexer.metadata_dictionary.maven.MavenMapping:'
        'Error unidecoding XML from foo')
    # NOTE(review): the fixture below shows no XML markup; confirm it
    # matches the intended test data.
    document = b"""
"""
    # The mapping is built inside the context manager so that anything
    # it logs is captured as well.
    with self.assertLogs('swh.indexer.metadata_dictionary',
                         level='WARNING') as logs:
        mapping = MAPPINGS["MavenMapping"]('foo')
        translated = mapping.translate(document)
    self.assertEqual(logs.output, [warning])
    self.assertEqual(translated, None)
def test_compute_metadata_maven_minimal(self):
    '''Checks a fixture with name, group, artifact and version only.

    No repository URL is listed in the fixture; the expected
    codeRepository is located under repo.maven.apache.org.
    '''
    # NOTE(review): the fixture below shows no XML markup; confirm it
    # matches the intended test data.
    pom = b"""
Maven Default Project
4.0.0
com.mycompany.app
my-app
1.2.3
"""
    expected = {
        '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
        'type': 'SoftwareSourceCode',
        'name': 'Maven Default Project',
        'identifier': 'com.mycompany.app',
        'version': '1.2.3',
        'codeRepository':
            'https://repo.maven.apache.org/maven2/com/mycompany/app/my-app',
    }
    self.assertEqual(self.maven_mapping.translate(pom), expected)
def test_compute_metadata_maven_empty_nodes(self):
    '''Runs the Maven mapping over five fixtures and compares each
    translation with its expected document.

    The cases are checked in order; the first mismatch fails the test.
    '''
    repository = \
        'https://repo.maven.apache.org/maven2/com/mycompany/app/my-app'
    # NOTE(review): the fixtures below show no XML markup; confirm
    # they match the intended test data.
    cases = [
        (b"""
Maven Default Project
4.0.0
com.mycompany.app
my-app
1.2.3
""", {
            '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
            'type': 'SoftwareSourceCode',
            'name': 'Maven Default Project',
            'identifier': 'com.mycompany.app',
            'version': '1.2.3',
            'codeRepository': repository,
        }),
        (b"""
Maven Default Project
4.0.0
com.mycompany.app
my-app
""", {
            '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
            'type': 'SoftwareSourceCode',
            'name': 'Maven Default Project',
            'identifier': 'com.mycompany.app',
            'codeRepository': repository,
        }),
        (b"""
4.0.0
com.mycompany.app
my-app
1.2.3
""", {
            '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
            'type': 'SoftwareSourceCode',
            'identifier': 'com.mycompany.app',
            'version': '1.2.3',
            'codeRepository': repository,
        }),
        (b"""
Maven Default Project
4.0.0
com.mycompany.app
my-app
1.2.3
""", {
            '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
            'type': 'SoftwareSourceCode',
            'name': 'Maven Default Project',
            'identifier': 'com.mycompany.app',
            'version': '1.2.3',
            'codeRepository': repository,
        }),
        (b"""
1.2.3
""", {
            '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
            'type': 'SoftwareSourceCode',
            'version': '1.2.3',
        }),
    ]
    for pom, expected in cases:
        self.assertEqual(self.maven_mapping.translate(pom), expected)
def test_compute_metadata_maven_invalid_licenses(self):
    '''Checks a fixture whose extra "foo" entry must not produce a
    license in the expected translation.'''
    # NOTE(review): the fixture below shows no XML markup; confirm it
    # matches the intended test data.
    pom = b"""
Maven Default Project
4.0.0
com.mycompany.app
my-app
1.2.3
foo
"""
    expected = {
        '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
        'type': 'SoftwareSourceCode',
        'name': 'Maven Default Project',
        'identifier': 'com.mycompany.app',
        'version': '1.2.3',
        'codeRepository':
            'https://repo.maven.apache.org/maven2/com/mycompany/app/my-app',
    }
    self.assertEqual(self.maven_mapping.translate(pom), expected)
def test_compute_metadata_maven_multiple(self):
    '''Checks a fixture listing two repositories and two licenses.

    Both 'license' and 'codeRepository' are expected as two-element
    lists, in the order the URLs appear in the fixture.
    '''
    # NOTE(review): the fixture below shows no XML markup; confirm it
    # matches the intended test data.
    pom = b"""
Maven Default Project
4.0.0
com.mycompany.app
my-app
1.2.3
central
Maven Repository Switchboard
default
http://repo1.maven.org/maven2
false
example
Example Maven Repo
default
http://example.org/maven2
Apache License, Version 2.0
https://www.apache.org/licenses/LICENSE-2.0.txt
repo
A business-friendly OSS license
MIT license
https://opensource.org/licenses/MIT
"""
    expected_licenses = [
        'https://www.apache.org/licenses/LICENSE-2.0.txt',
        'https://opensource.org/licenses/MIT',
    ]
    expected_repositories = [
        'http://repo1.maven.org/maven2/com/mycompany/app/my-app',
        'http://example.org/maven2/com/mycompany/app/my-app',
    ]
    expected = {
        '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
        'type': 'SoftwareSourceCode',
        'name': 'Maven Default Project',
        'identifier': 'com.mycompany.app',
        'version': '1.2.3',
        'license': expected_licenses,
        'codeRepository': expected_repositories,
    }
    self.assertEqual(self.maven_mapping.translate(pom), expected)
def test_compute_metadata_pkginfo(self):
    '''Checks the translation of a full PKG-INFO fixture.

    The 'description' entry is compared without regard to order, then
    removed so the remaining entries can be compared exactly.
    '''
    pkg_info = (b"""\
Metadata-Version: 2.1
Name: swh.core
Version: 0.0.49
Summary: Software Heritage core utilities
Home-page: https://forge.softwareheritage.org/diffusion/DCORE/
Author: Software Heritage developers
Author-email: swh-devel@inria.fr
License: UNKNOWN
Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest
Project-URL: Funding, https://www.softwareheritage.org/donate
Project-URL: Source, https://forge.softwareheritage.org/source/swh-core
Description: swh-core
========
\x20
core library for swh's modules:
- config parser
- hash computations
- serialization
- logging mechanism
\x20
Platform: UNKNOWN
Classifier: Programming Language :: Python :: 3
Classifier: Intended Audience :: Developers
Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
Classifier: Operating System :: OS Independent
Classifier: Development Status :: 5 - Production/Stable
Description-Content-Type: text/markdown
Provides-Extra: testing
""")  # noqa
    # Two descriptions are expected: the one-line summary, and the
    # multi-line text assembled from the adjacent literals below.
    long_description = (
        'swh-core\n'
        '========\n'
        '\n'
        "core library for swh's modules:\n"
        '- config parser\n'
        '- hash computations\n'
        '- serialization\n'
        '- logging mechanism\n'
        '')
    expected_descriptions = [
        'Software Heritage core utilities',
        long_description,
    ]
    expected_remainder = {
        '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
        'type': 'SoftwareSourceCode',
        'url': 'https://forge.softwareheritage.org/diffusion/DCORE/',
        'name': 'swh.core',
        'author': [{
            'type': 'Person',
            'name': 'Software Heritage developers',
            'email': 'swh-devel@inria.fr',
        }],
        'version': '0.0.49',
    }

    translated = self.pkginfo_mapping.translate(pkg_info)
    # The whole translation is passed as the failure message.
    self.assertCountEqual(
        translated['description'], expected_descriptions, translated)
    del translated['description']
    self.assertEqual(translated, expected_remainder)
def test_compute_metadata_pkginfo_utf8(self):
    '''Checks that the bytes \\xc2\\xb0 in the description are expected
    to come out as the degree sign in the translated text.'''
    pkg_info = (b'''\
Metadata-Version: 1.1
Name: snowpyt
Description-Content-Type: UNKNOWN
Description: foo
Hydrology N\xc2\xb083
''')  # noqa
    expected = {
        '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
        'type': 'SoftwareSourceCode',
        'name': 'snowpyt',
        'description': 'foo\nHydrology N°83',
    }
    self.assertEqual(self.pkginfo_mapping.translate(pkg_info), expected)
def test_compute_metadata_pkginfo_keywords(self):
    '''Checks that a space-separated Keywords field is expected as a
    list of individual keywords.'''
    pkg_info = (b"""\
Metadata-Version: 2.1
Name: foo
Keywords: foo bar baz
""")  # noqa
    expected = {
        '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
        'type': 'SoftwareSourceCode',
        'name': 'foo',
        'keywords': ['foo', 'bar', 'baz'],
    }
    self.assertEqual(self.pkginfo_mapping.translate(pkg_info), expected)
def test_compute_metadata_pkginfo_license(self):
    '''Checks that the License field value is expected unchanged under
    the 'license' key.'''
    pkg_info = (b"""\
Metadata-Version: 2.1
Name: foo
License: MIT
""")  # noqa
    expected = {
        '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
        'type': 'SoftwareSourceCode',
        'name': 'foo',
        'license': 'MIT',
    }
    self.assertEqual(self.pkginfo_mapping.translate(pkg_info), expected)
def test_gemspec_base(self):
    '''Checks the translation of a gemspec that sets the common fields.

    The summary and description are both expected under 'description',
    in any order; the remaining entries are compared exactly.
    '''
    gemspec = b"""
Gem::Specification.new do |s|
s.name = 'example'
s.version = '0.1.0'
s.licenses = ['MIT']
s.summary = "This is an example!"
s.description = "Much longer explanation of the example!"
s.authors = ["Ruby Coder"]
s.email = 'rubycoder@example.com'
s.files = ["lib/example.rb"]
s.homepage = 'https://rubygems.org/gems/example'
s.metadata = { "source_code_uri" => "https://github.com/example/example" }
end"""
    expected_descriptions = [
        "This is an example!",
        "Much longer explanation of the example!"
    ]
    expected_remainder = {
        '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
        'type': 'SoftwareSourceCode',
        'author': [
            {
                'type': 'Person',
                'name': 'Ruby Coder'
            }
        ],
        'name': 'example',
        'license': 'https://spdx.org/licenses/MIT',
        'codeRepository': 'https://rubygems.org/gems/example',
        'email': 'rubycoder@example.com',
        'version': '0.1.0',
    }

    translated = self.gemspec_mapping.translate(gemspec)
    descriptions = translated.pop('description')
    self.assertCountEqual(descriptions, expected_descriptions)
    self.assertEqual(translated, expected_remainder)
def test_gemspec_two_author_fields(self):
    '''Checks a gemspec that sets both `authors` and `author`.

    Both names are expected under 'author', in any order.
    '''
    gemspec = b"""
Gem::Specification.new do |s|
s.authors = ["Ruby Coder1"]
s.author = "Ruby Coder2"
end"""
    expected_authors = [
        {
            'type': 'Person',
            'name': 'Ruby Coder1'
        },
        {
            'type': 'Person',
            'name': 'Ruby Coder2'
        },
    ]
    expected_remainder = {
        '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
        'type': 'SoftwareSourceCode',
    }

    translated = self.gemspec_mapping.translate(gemspec)
    authors = translated.pop('author')
    self.assertCountEqual(authors, expected_authors)
    self.assertEqual(translated, expected_remainder)
def test_gemspec_invalid_author(self):
    '''Checks three gemspecs with malformed author values.

    The first two are expected to yield no author at all; the third is
    expected to keep only 'Ruby Coder1' and drop the nested list.
    '''
    cases = [
        # `author` given a list.
        (b"""
Gem::Specification.new do |s|
s.author = ["Ruby Coder"]
end""", {
            '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
            'type': 'SoftwareSourceCode',
        }),
        # `author` followed by a trailing comma.
        (b"""
Gem::Specification.new do |s|
s.author = "Ruby Coder1",
end""", {
            '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
            'type': 'SoftwareSourceCode',
        }),
        # `authors` containing a nested list.
        (b"""
Gem::Specification.new do |s|
s.authors = ["Ruby Coder1", ["Ruby Coder2"]]
end""", {
            '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
            'type': 'SoftwareSourceCode',
            'author': [
                {
                    'type': 'Person',
                    'name': 'Ruby Coder1'
                }
            ],
        }),
    ]
    for gemspec, expected in cases:
        self.assertEqual(self.gemspec_mapping.translate(gemspec), expected)
def test_gemspec_alternative_header(self):
    '''Checks a gemspec written with a brace block (`new { |s| ... }`)
    preceded by a `require` line.'''
    gemspec = b"""
require './lib/version'
Gem::Specification.new { |s|
s.name = 'rb-system-with-aliases'
s.summary = 'execute system commands with aliases'
}
"""
    expected = {
        '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
        'type': 'SoftwareSourceCode',
        'name': 'rb-system-with-aliases',
        'description': 'execute system commands with aliases',
    }
    self.assertEqual(self.gemspec_mapping.translate(gemspec), expected)
@settings(suppress_health_check=[HealthCheck.too_slow])
@given(json_document_strategy(keys=list(NpmMapping.mapping)))
def test_npm_adversarial(self, doc):
    '''Feeds generated JSON documents to the NPM mapping.

    Nothing is asserted on the result; the test fails only if
    serializing or translating the document raises.
    '''
    self.npm_mapping.translate(json.dumps(doc).encode())
@settings(suppress_health_check=[HealthCheck.too_slow])
@given(json_document_strategy(keys=CODEMETA_TERMS))
def test_codemeta_adversarial(self, doc):
    '''Feeds generated JSON documents to the CodeMeta mapping.

    Nothing is asserted on the result; the test fails only if
    serializing or translating the document raises.
    '''
    self.codemeta_mapping.translate(json.dumps(doc).encode())
@settings(suppress_health_check=[HealthCheck.too_slow])
@given(xml_document_strategy(
    keys=list(MavenMapping.mapping),
    root='project',
    xmlns='http://maven.apache.org/POM/4.0.0'))
def test_maven_adversarial(self, doc):
    '''Feeds generated documents to the Maven mapping.

    Nothing is asserted on the result; the test fails only if
    translating the document raises.
    '''
    generated_document = doc
    self.maven_mapping.translate(generated_document)
@settings(suppress_health_check=[HealthCheck.too_slow])
@given(strategies.dictionaries(
    # keys
    strategies.one_of(
        strategies.text(),
        *map(strategies.just, GemspecMapping.mapping)
    ),
    # values
    strategies.recursive(
        strategies.characters(),
        lambda children: strategies.lists(children, min_size=1)
    )
))
def test_gemspec_adversarial(self, doc):
    '''Builds a gemspec from a generated dictionary and translates it.

    Each item becomes one `s.<key> = <repr(value)>` line between the
    `Gem::Specification.new do |s|` header and the closing `end`.
    Nothing is asserted on the result; the test fails only if building
    or translating the gemspec raises.
    '''
    assignments = [
        ' s.{} = {}\n'.format(field, repr(value)).encode()
        for field, value in doc.items()
    ]
    gemspec = b''.join(
        [b'Gem::Specification.new do |s|\n'] + assignments + [b'end\n'])
    self.gemspec_mapping.translate(gemspec)
def test_revision_metadata_indexer(self):
    '''Runs the revision metadata indexer on one revision and checks
    the intrinsic metadata stored for it.

    Content metadata is registered for content b'cde' beforehand; the
    revision is expected to end up with that same metadata and the
    'npm' mapping.
    '''
    indexer = RevisionMetadataIndexer(config=REVISION_METADATA_CONFIG)
    fill_obj_storage(indexer.objstorage)
    fill_storage(indexer.storage)

    # Look up the registered translator tool to get its id.
    tool_query = {
        'tool_' + key: value for key, value in TRANSLATOR_TOOL.items()}
    tool = indexer.idx_storage.indexer_configuration_get(tool_query)
    assert tool is not None

    content_metadata = {
        'indexer_configuration_id': tool['id'],
        'id': b'cde',
        'metadata': YARN_PARSER_METADATA,
    }
    indexer.idx_storage.content_metadata_add([content_metadata])

    revision_id = hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f')
    sha1_gits = [revision_id]
    indexer.run(sha1_gits, 'update-dups')

    results = list(
        indexer.idx_storage.revision_intrinsic_metadata_get(sha1_gits))

    expected_results = [{
        'id': revision_id,
        'tool': TRANSLATOR_TOOL,
        'metadata': YARN_PARSER_METADATA,
        'mappings': ['npm'],
    }]

    # Drop the tool id from each result before comparing with
    # TRANSLATOR_TOOL.
    for entry in results:
        del entry['tool']['id']

    # then
    self.assertEqual(expected_results, results)
def test_revision_metadata_indexer_single_root_dir(self):
    '''Runs the revision metadata indexer after adding a directory
    whose only entry is the revision's original root directory.

    The stored intrinsic metadata is expected to match the content
    metadata registered for content b'cde', with the 'npm' mapping.
    '''
    indexer = RevisionMetadataIndexer(config=REVISION_METADATA_CONFIG)
    fill_obj_storage(indexer.objstorage)
    fill_storage(indexer.storage)

    # Add a parent directory, that is the only directory at the root
    # of the revision
    revision_id = hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f')
    rev = indexer.storage._revisions[revision_id]
    subdir_id = rev.directory
    # NOTE(review): the evolved revision is only bound to this local
    # and is not referenced again in this test -- confirm whether it
    # should be written back to storage.
    rev = attr.evolve(rev, directory=b'123456')
    wrapper_directory = {
        'id': b'123456',
        'entries': [{
            'name': b'foobar-1.0.0',
            'type': 'dir',
            'target': subdir_id,
            'perms': 16384,
        }],
    }
    indexer.storage.directory_add([wrapper_directory])

    # Look up the registered translator tool to get its id.
    tool_query = {
        'tool_' + key: value for key, value in TRANSLATOR_TOOL.items()}
    tool = indexer.idx_storage.indexer_configuration_get(tool_query)
    assert tool is not None

    content_metadata = {
        'indexer_configuration_id': tool['id'],
        'id': b'cde',
        'metadata': YARN_PARSER_METADATA,
    }
    indexer.idx_storage.content_metadata_add([content_metadata])

    sha1_gits = [
        hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f'),
    ]
    indexer.run(sha1_gits, 'update-dups')

    results = list(
        indexer.idx_storage.revision_intrinsic_metadata_get(sha1_gits))

    expected_results = [{
        'id': hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f'),
        'tool': TRANSLATOR_TOOL,
        'metadata': YARN_PARSER_METADATA,
        'mappings': ['npm'],
    }]

    # Drop the tool id from each result before comparing with
    # TRANSLATOR_TOOL.
    for entry in results:
        del entry['tool']['id']

    # then
    self.assertEqual(expected_results, results)