diff --git a/swh/indexer/codemeta.py b/swh/indexer/codemeta.py --- a/swh/indexer/codemeta.py +++ b/swh/indexer/codemeta.py @@ -186,7 +186,20 @@ merged_document[SCHEMA_URI + 'sameAs'].append(value) else: for value in values: - if value not in merged_document[key]: + if isinstance(value, dict) and set(value) == {'@list'}: + # Value is of the form {'@list': [item1, item2]} + # instead of the usual [item1, item2]. + # We need to merge the inner lists (and mostly + # preserve order). + merged_value = merged_document.setdefault( + key, {'@list': []}) + for subvalue in value['@list']: + # merged_value must be of the form + # {'@list': [item1, item2]}; as it is the same + # type as value, which is an @list. + if subvalue not in merged_value['@list']: + merged_value['@list'].append(subvalue) + elif value not in merged_document[key]: merged_document[key].append(value) return compact(merged_document) diff --git a/swh/indexer/tests/test_codemeta.py b/swh/indexer/tests/test_codemeta.py --- a/swh/indexer/tests/test_codemeta.py +++ b/swh/indexer/tests/test_codemeta.py @@ -156,3 +156,128 @@ "name": ['test_1', 'test_1b', 'test_2'] } assert results == expected_results + + +def test_merge_documents_lists(): + """Tests merging two @list elements.""" + # given + metadata_list = [{ + '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', + 'author': { + '@list': [ + {'name': 'test_1'}, + ] + }, + }, { + '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', + 'author': { + '@list': [ + {'name': 'test_2'}, + ] + }, + }] + + # when + results = merge_documents(metadata_list) + + # then + expected_results = { + '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', + 'author': [ + {'name': 'test_1'}, + {'name': 'test_2'}, + ], + } + assert results == expected_results + + +def test_merge_documents_lists_duplicates(): + """Tests merging two @list elements with a duplicate subelement.""" + # given + metadata_list = [{ + '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', + 'author': { + '@list': [ + {'name': 'test_1'}, + ] + }, + }, { + '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', + 'author': { + '@list': [ + {'name': 'test_2'}, + {'name': 'test_1'}, + ] + }, + }] + + # when + results = merge_documents(metadata_list) + + # then + expected_results = { + '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', + 'author': [ + {'name': 'test_1'}, + {'name': 'test_2'}, + ], + } + assert results == expected_results + + +def test_merge_documents_list_left(): + """Tests merging a singleton with an @list.""" + # given + metadata_list = [{ + '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', + 'author': {'name': 'test_1'}, + }, { + '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', + 'author': { + '@list': [ + {'name': 'test_2'}, + ] + }, + }] + + # when + results = merge_documents(metadata_list) + + # then + expected_results = { + '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', + 'author': [ + {'name': 'test_1'}, + {'name': 'test_2'}, + ], + } + assert results == expected_results + + +def test_merge_documents_list_right(): + """Tests merging an @list with a singleton.""" + # given + metadata_list = [{ + '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', + 'author': { + '@list': [ + {'name': 'test_1'}, + ] + }, + }, { + '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', + 'author': {'name': 'test_2'}, + }] + + # when + results = merge_documents(metadata_list) + + # then + expected_results = { + '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', + 'author': [ + {'name': 'test_1'}, + {'name': 'test_2'}, + ], + } + assert results == expected_results