def merge_values(v1, v2):
    """Merge two JSON-LD property values into a single value.

    The rules are applied in this order:

    * If one of the values is `None`, the other one is returned as-is
      (the very same object, not a copy).
    * If v1 and v2 are of the form `{"@list": l1}` and `{"@list": l2}`,
      returns `{"@list": l1 + l2}`.
    * Otherwise, make them lists (if they are not already) and concatenate
      them into a new list.

    Raises:
        ValueError: if an `@list` value is merged with something that is
            not an `@list` value, or if the content of an `@list` value
            is not a list.

    >>> merge_values('a', 'b')
    ['a', 'b']
    >>> merge_values(['a', 'b'], 'c')
    ['a', 'b', 'c']
    >>> merge_values({'@list': ['a', 'b']}, {'@list': ['c']})
    {'@list': ['a', 'b', 'c']}
    >>> merge_values('a', None)
    'a'
    """
    if v1 is None:
        return v2
    if v2 is None:
        return v1

    if isinstance(v1, dict) and set(v1) == {'@list'}:
        # An ordered list can only be merged with another ordered list.
        if not (isinstance(v2, dict) and set(v2) == {'@list'}):
            raise ValueError('Cannot merge %r and %r' % (v1, v2))
        # Explicit check instead of `assert`: assertions are stripped when
        # Python runs with -O, which would let a non-list payload (eg. a
        # string) be silently concatenated.
        if not isinstance(v1['@list'], list) \
                or not isinstance(v2['@list'], list):
            raise ValueError('Cannot merge %r and %r' % (v1, v2))
        return {'@list': v1['@list'] + v2['@list']}

    # NOTE(review): v2 is rejected as soon as it has an '@list' key, even
    # next to other keys, whereas v1 is only treated as an ordered list
    # when '@list' is its sole key. Kept as-is.
    if isinstance(v2, dict) and '@list' in v2:
        raise ValueError('Cannot merge %r and %r' % (v1, v2))

    if not isinstance(v1, list):
        v1 = [v1]
    if not isinstance(v2, list):
        v2 = [v2]
    return v1 + v2
# Copyright (C) 2018-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import pytest

from swh.indexer.codemeta import merge_documents, merge_values
from swh.indexer.codemeta import CROSSWALK_TABLE


def test_crosstable():
    """Checks the 'NodeJS' entry of the crosswalk table, term by term."""
    expected_nodejs = {
        'repository': 'http://schema.org/codeRepository',
        'os': 'http://schema.org/operatingSystem',
        'cpu': 'http://schema.org/processorRequirements',
        'engines': 'http://schema.org/processorRequirements',
        'author': 'http://schema.org/author',
        'author.email': 'http://schema.org/email',
        'author.name': 'http://schema.org/name',
        'contributor': 'http://schema.org/contributor',
        'keywords': 'http://schema.org/keywords',
        'license': 'http://schema.org/license',
        'version': 'http://schema.org/version',
        'description': 'http://schema.org/description',
        'name': 'http://schema.org/name',
        'bugs': 'https://codemeta.github.io/terms/issueTracker',
        'homepage': 'http://schema.org/url'
    }

    assert CROSSWALK_TABLE['NodeJS'] == expected_nodejs


def test_merge_values():
    """Checks merge_values on mergeable, unmergeable and None operands."""
    # Pairs that merge: plain values/lists first, then '@list' values.
    mergeable = [
        (('a', 'b'), ['a', 'b']),
        ((['a', 'b'], 'c'), ['a', 'b', 'c']),
        (('a', ['b', 'c']), ['a', 'b', 'c']),
        (({'@list': ['a']}, {'@list': ['b']}), {'@list': ['a', 'b']}),
        (({'@list': ['a', 'b']}, {'@list': ['c']}),
         {'@list': ['a', 'b', 'c']}),
    ]
    for (left, right), merged in mergeable:
        assert merge_values(left, right) == merged

    # An '@list' value mixed with anything else must be rejected.
    unmergeable = [
        ({'@list': ['a']}, 'b'),
        ('a', {'@list': ['b']}),
        ({'@list': ['a']}, ['b']),
        (['a'], {'@list': ['b']}),
    ]
    for left, right in unmergeable:
        with pytest.raises(ValueError):
            merge_values(left, right)

    # None on either side yields the other operand.
    with_none = [
        (('a', None), 'a'),
        ((['a', 'b'], None), ['a', 'b']),
        ((None, ['b', 'c']), ['b', 'c']),
        (({'@list': ['a']}, None), {'@list': ['a']}),
        ((None, {'@list': ['a']}), {'@list': ['a']}),
    ]
    for (left, right), kept in with_none:
        assert merge_values(left, right) == kept


def test_merge_documents():
    """
    Merges three documents sharing a version; the differing names are
    expected as a list and the single author wrapped in a list.
    """
    # given
    first = {
        '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
        'name': 'test_1',
        'version': '0.0.2',
        'description': 'Simple package.json test for indexer',
        'codeRepository':
            'git+https://github.com/moranegg/metadata_test',
    }
    second = {
        '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
        'name': 'test_0_1',
        'version': '0.0.2',
        'description': 'Simple package.json test for indexer',
        'codeRepository':
            'git+https://github.com/moranegg/metadata_test'
    }
    third = {
        '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
        'name': 'test_metadata',
        'version': '0.0.2',
        'author': {
            'type': 'Person',
            'name': 'moranegg',
        },
    }

    # when
    merged = merge_documents([first, second, third])

    # then
    assert merged == {
        '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
        "version": '0.0.2',
        "description": 'Simple package.json test for indexer',
        "name": ['test_1', 'test_0_1', 'test_metadata'],
        "author": [{
            'type': 'Person',
            'name': 'moranegg'
        }],
        "codeRepository":
            'git+https://github.com/moranegg/metadata_test',
    }


def test_merge_documents_ids():
    """
    Merges two documents with distinct ids; the first id is expected
    under 'id' and the second one under 'schema:sameAs'.
    """
    # given
    first = {
        '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
        'id': 'http://example.org/test1',
        'name': 'test_1',
    }
    second = {
        '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
        'id': 'http://example.org/test2',
        'name': 'test_2',
    }

    # when
    merged = merge_documents([first, second])

    # then
    assert merged == {
        '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
        'id': 'http://example.org/test1',
        'schema:sameAs': 'http://example.org/test2',
        "name": ['test_1', 'test_2']
    }


def test_merge_documents_duplicate_ids():
    """
    Merges three documents, two of which share an id; the shared id is
    expected under 'id' only, and the other one under 'schema:sameAs'.
    """
    # given
    first = {
        '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
        'id': 'http://example.org/test1',
        'name': 'test_1',
    }
    same_id_as_first = {
        '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
        'id': 'http://example.org/test1',
        'name': 'test_1b',
    }
    other = {
        '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
        'id': 'http://example.org/test2',
        'name': 'test_2',
    }

    # when
    merged = merge_documents([first, same_id_as_first, other])

    # then
    assert merged == {
        '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
        'id': 'http://example.org/test1',
        'schema:sameAs': 'http://example.org/test2',
        "name": ['test_1', 'test_1b', 'test_2']
    }
from swh.model.hashutil import hash_to_bytes -from swh.indexer.codemeta import CODEMETA_TERMS, CROSSWALK_TABLE -from swh.indexer.codemeta import merge_documents +from swh.indexer.codemeta import CODEMETA_TERMS from swh.indexer.metadata_dictionary import MAPPINGS -from swh.indexer.metadata_dictionary.base import merge_values from swh.indexer.metadata_dictionary.maven import MavenMapping from swh.indexer.metadata_dictionary.npm import NpmMapping from swh.indexer.metadata_dictionary.ruby import GemspecMapping @@ -72,69 +70,6 @@ self.pkginfo_mapping = MAPPINGS['PythonPkginfoMapping']() self.gemspec_mapping = MAPPINGS['GemspecMapping']() - def test_crosstable(self): - self.assertEqual(CROSSWALK_TABLE['NodeJS'], { - 'repository': 'http://schema.org/codeRepository', - 'os': 'http://schema.org/operatingSystem', - 'cpu': 'http://schema.org/processorRequirements', - 'engines': - 'http://schema.org/processorRequirements', - 'author': 'http://schema.org/author', - 'author.email': 'http://schema.org/email', - 'author.name': 'http://schema.org/name', - 'contributor': 'http://schema.org/contributor', - 'keywords': 'http://schema.org/keywords', - 'license': 'http://schema.org/license', - 'version': 'http://schema.org/version', - 'description': 'http://schema.org/description', - 'name': 'http://schema.org/name', - 'bugs': 'https://codemeta.github.io/terms/issueTracker', - 'homepage': 'http://schema.org/url' - }) - - def test_merge_values(self): - self.assertEqual( - merge_values('a', 'b'), - ['a', 'b']) - self.assertEqual( - merge_values(['a', 'b'], 'c'), - ['a', 'b', 'c']) - self.assertEqual( - merge_values('a', ['b', 'c']), - ['a', 'b', 'c']) - - self.assertEqual( - merge_values({'@list': ['a']}, {'@list': ['b']}), - {'@list': ['a', 'b']}) - self.assertEqual( - merge_values({'@list': ['a', 'b']}, {'@list': ['c']}), - {'@list': ['a', 'b', 'c']}) - - with self.assertRaises(ValueError): - merge_values({'@list': ['a']}, 'b') - with self.assertRaises(ValueError): - merge_values('a', 
{'@list': ['b']}) - with self.assertRaises(ValueError): - merge_values({'@list': ['a']}, ['b']) - with self.assertRaises(ValueError): - merge_values(['a'], {'@list': ['b']}) - - self.assertEqual( - merge_values('a', None), - 'a') - self.assertEqual( - merge_values(['a', 'b'], None), - ['a', 'b']) - self.assertEqual( - merge_values(None, ['b', 'c']), - ['b', 'c']) - self.assertEqual( - merge_values({'@list': ['a']}, None), - {'@list': ['a']}) - self.assertEqual( - merge_values(None, {'@list': ['a']}), - {'@list': ['a']}) - def test_compute_metadata_none(self): """ testing content empty content is empty @@ -190,105 +125,6 @@ # then self.assertEqual(declared_metadata, result) - def test_merge_documents(self): - """ - Test the creation of a coherent minimal metadata set - """ - # given - metadata_list = [{ - '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', - 'name': 'test_1', - 'version': '0.0.2', - 'description': 'Simple package.json test for indexer', - 'codeRepository': - 'git+https://github.com/moranegg/metadata_test', - }, { - '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', - 'name': 'test_0_1', - 'version': '0.0.2', - 'description': 'Simple package.json test for indexer', - 'codeRepository': - 'git+https://github.com/moranegg/metadata_test' - }, { - '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', - 'name': 'test_metadata', - 'version': '0.0.2', - 'author': { - 'type': 'Person', - 'name': 'moranegg', - }, - }] - - # when - results = merge_documents(metadata_list) - - # then - expected_results = { - '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', - "version": '0.0.2', - "description": 'Simple package.json test for indexer', - "name": ['test_1', 'test_0_1', 'test_metadata'], - "author": [{ - 'type': 'Person', - 'name': 'moranegg' - }], - "codeRepository": - 'git+https://github.com/moranegg/metadata_test', - } - self.assertEqual(expected_results, results) - - def test_merge_documents_ids(self): - # given - metadata_list = [{ 
- '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', - 'id': 'http://example.org/test1', - 'name': 'test_1', - }, { - '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', - 'id': 'http://example.org/test2', - 'name': 'test_2', - }] - - # when - results = merge_documents(metadata_list) - - # then - expected_results = { - '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', - 'id': 'http://example.org/test1', - 'schema:sameAs': 'http://example.org/test2', - "name": ['test_1', 'test_2'] - } - self.assertEqual(expected_results, results) - - def test_merge_documents_duplicate_ids(self): - # given - metadata_list = [{ - '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', - 'id': 'http://example.org/test1', - 'name': 'test_1', - }, { - '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', - 'id': 'http://example.org/test1', - 'name': 'test_1b', - }, { - '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', - 'id': 'http://example.org/test2', - 'name': 'test_2', - }] - - # when - results = merge_documents(metadata_list) - - # then - expected_results = { - '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', - 'id': 'http://example.org/test1', - 'schema:sameAs': 'http://example.org/test2', - "name": ['test_1', 'test_1b', 'test_2'] - } - self.assertEqual(expected_results, results) - def test_index_content_metadata_npm(self): """ testing NPM with package.json