diff --git a/swh/indexer/codemeta.py b/swh/indexer/codemeta.py
--- a/swh/indexer/codemeta.py
+++ b/swh/indexer/codemeta.py
@@ -140,8 +140,19 @@
     merged_document = collections.defaultdict(list)
     for document in documents:
         for (key, values) in document.items():
-            for value in values:
-                if value not in merged_document[key]:
-                    merged_document[key].append(value)
+            if key == '@id':
+                # '@id' holds a single value: it is not expanded to a list
+                value = values
+
+                # Only one @id allowed: keep the first, put others in sameAs
+                if '@id' not in merged_document:
+                    merged_document['@id'] = value
+                elif value != merged_document['@id']:
+                    if value not in merged_document[SCHEMA_URI + 'sameAs']:
+                        merged_document[SCHEMA_URI + 'sameAs'].append(value)
+            else:
+                for value in values:
+                    if value not in merged_document[key]:
+                        merged_document[key].append(value)

     return compact(merged_document)
diff --git a/swh/indexer/tests/test_metadata.py b/swh/indexer/tests/test_metadata.py
--- a/swh/indexer/tests/test_metadata.py
+++ b/swh/indexer/tests/test_metadata.py
@@ -237,6 +237,58 @@
         }
         self.assertEqual(expected_results, results)

+    def test_merge_documents_ids(self):
+        # given: two documents that carry different ids
+        metadata_list = [{
+            '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
+            'id': 'http://example.org/test1',
+            'name': 'test_1',
+        }, {
+            '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
+            'id': 'http://example.org/test2',
+            'name': 'test_2',
+        }]
+
+        # when: the documents are merged
+        results = merge_documents(metadata_list)
+
+        # then: the first id is kept, the second one is under schema:sameAs
+        expected_results = {
+            '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
+            'id': 'http://example.org/test1',
+            'schema:sameAs': 'http://example.org/test2',
+            "name": ['test_1', 'test_2']
+        }
+        self.assertEqual(expected_results, results)
+
+    def test_merge_documents_duplicate_ids(self):
+        # given: three documents, the first two sharing the same id
+        metadata_list = [{
+            '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
+            'id': 'http://example.org/test1',
+            'name': 'test_1',
+        }, {
+            '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
+            'id': 'http://example.org/test1',
+            'name': 'test_1b',
+        }, {
+            '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
+            'id': 'http://example.org/test2',
+            'name': 'test_2',
+        }]
+
+        # when: the documents are merged
+        results = merge_documents(metadata_list)
+
+        # then: the shared id is kept once and only test2 is under sameAs
+        expected_results = {
+            '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
+            'id': 'http://example.org/test1',
+            'schema:sameAs': 'http://example.org/test2',
+            "name": ['test_1', 'test_1b', 'test_2']
+        }
+        self.assertEqual(expected_results, results)
+
+    def test_index_content_metadata_npm(self):
+        """
         testing NPM with package.json