diff --git a/swh/indexer/storage/__init__.py b/swh/indexer/storage/__init__.py
--- a/swh/indexer/storage/__init__.py
+++ b/swh/indexer/storage/__init__.py
@@ -49,6 +49,34 @@
     return IndexerStorage(**args)
 
 
+def _check_duplicates(data, key):
+    """
+    Raises a `ValueError` if any two dictionaries in `data` have
+    the same value for `key`.
+
+    Values associated with the key must be hashable.
+
+    Args:
+        data (List[dict]): List of dictionaries to be inserted.
+        key (str): Name of the key that acts as the id.
+
+    >>> _check_duplicates([
+    ...     {'id': 'foo', 'data': 'spam'},
+    ...     {'id': 'bar', 'data': 'egg'},
+    ... ], 'id')
+    >>> _check_duplicates([
+    ...     {'id': 'foo', 'data': 'spam'},
+    ...     {'id': 'foo', 'data': 'egg'},
+    ... ], 'id')
+    Traceback (most recent call last):
+      ...
+    ValueError: The same id is present more than once.
+    """
+    if len({item[key] for item in data}) < len(data):
+        raise ValueError(
+            'The same {} is present more than once.'.format(key))
+
+
 class IndexerStorage:
     """SWH Indexer Storage
 
@@ -216,6 +244,7 @@
                 default)
 
         """
+        _check_duplicates(mimetypes, 'id')
         db.mktemp_content_mimetype(cur)
         db.copy_to(mimetypes, 'tmp_content_mimetype',
                    ['id', 'mimetype', 'encoding', 'indexer_configuration_id'],
@@ -300,6 +329,7 @@
                 default)
 
         """
+        _check_duplicates(languages, 'id')
         db.mktemp_content_language(cur)
         # empty language is mapped to 'unknown'
         db.copy_to(
@@ -369,6 +399,8 @@
                   line, lang
 
         """
+        _check_duplicates(ctags, 'id')
+
         def _convert_ctags(__ctags):
             """Convert ctags dict to list of ctags.
 
@@ -449,7 +481,7 @@
             list: content_license entries which failed due to unknown licenses
 
         """
-        # Then, we add the correct ones
+        _check_duplicates(licenses, 'id')
         db.mktemp_content_fossology_license(cur)
         db.copy_to(
             ({
@@ -547,6 +579,8 @@
                 or skip duplicates (false, the default)
 
         """
+        _check_duplicates(metadata, 'id')
+
         db.mktemp_content_metadata(cur)
 
         db.copy_to(metadata, 'tmp_content_metadata',
@@ -614,6 +648,8 @@
                 or skip duplicates (false, the default)
 
         """
+        _check_duplicates(metadata, 'id')
+
         db.mktemp_revision_metadata(cur)
 
         db.copy_to(metadata, 'tmp_revision_metadata',
@@ -666,6 +702,8 @@
                 or skip duplicates (false, the default)
 
         """
+        _check_duplicates(metadata, 'origin_id')
+
         db.mktemp_origin_intrinsic_metadata(cur)
 
         db.copy_to(metadata, 'tmp_origin_intrinsic_metadata',
diff --git a/swh/indexer/storage/in_memory.py b/swh/indexer/storage/in_memory.py
--- a/swh/indexer/storage/in_memory.py
+++ b/swh/indexer/storage/in_memory.py
@@ -130,6 +130,10 @@
                 (true) or skip duplicates (false)
 
         """
+        data = list(data)
+        if len({x['id'] for x in data}) < len(data):
+            # For "exception-compatibility" with the pgsql backend
+            raise ValueError('The same id is present more than once.')
         for item in data:
             item = item.copy()
             tool_id = item.pop('indexer_configuration_id')
diff --git a/swh/indexer/tests/storage/test_storage.py b/swh/indexer/tests/storage/test_storage.py
--- a/swh/indexer/tests/storage/test_storage.py
+++ b/swh/indexer/tests/storage/test_storage.py
@@ -223,6 +223,41 @@
         # data did change as the v2 was used to overwrite v1
         self.assertEqual(actual_data, expected_data_v2)
 
+    def add__duplicate_twice(self):
+        # given
+        tool_id = self.tools[tool_name]['id']
+
+        data_rev1 = {
+            'id': self.revision_id_2,
+            **example_data1,
+            'indexer_configuration_id': tool_id
+        }
+
+        data_rev2 = {
+            'id': self.revision_id_2,
+            **example_data2,
+            'indexer_configuration_id': tool_id
+        }
+
+        # when
+        endpoint(self, 'add')([data_rev1])
+
+        with self.assertRaises(ValueError):
+            endpoint(self, 'add')(
+                [data_rev2, data_rev2],
+                conflict_update=True)
+
+        # then
+        actual_data = list(endpoint(self, 'get')(
+            [self.revision_id_2, self.revision_id_1]))
+
+        expected_data = [{
+            'id': self.revision_id_2,
+            **example_data1,
+            'tool': self.tools[tool_name]
+        }]
+        self.assertEqual(actual_data, expected_data)
+
     @rename
     def get(self):
         # given
@@ -255,6 +290,7 @@
         missing,
         add__drop_duplicate,
         add__update_in_place_duplicate,
+        add__duplicate_twice,
         get,
     )
 
@@ -300,6 +336,7 @@
         test_content_mimetype_missing,
         test_content_mimetype_add__drop_duplicate,
         test_content_mimetype_add__update_in_place_duplicate,
+        test_content_mimetype_add__duplicate_twice,
         test_content_mimetype_get,
     ) = gen_generic_endpoint_tests(
         endpoint_type='content_mimetype',
@@ -319,6 +356,7 @@
         test_content_language_missing,
         test_content_language_add__drop_duplicate,
         test_content_language_add__update_in_place_duplicate,
+        test_content_language_add__duplicate_twice,
         test_content_language_get,
     ) = gen_generic_endpoint_tests(
         endpoint_type='content_language',
@@ -337,6 +375,7 @@
         # the following tests are disabled because CTAGS behave differently
         _,  # test_content_ctags_add__drop_duplicate,
         _,  # test_content_ctags_add__update_in_place_duplicate,
+        _,  # test_content_ctags_add__duplicate_twice,
         _,  # test_content_ctags_get,
     ) = gen_generic_endpoint_tests(
         endpoint_type='content_ctags',
@@ -743,6 +782,7 @@
         test_content_metadata_missing,
         test_content_metadata_add__drop_duplicate,
         test_content_metadata_add__update_in_place_duplicate,
+        test_content_metadata_add__duplicate_twice,
         test_content_metadata_get,
     ) = gen_generic_endpoint_tests(
         endpoint_type='content_metadata',
@@ -773,6 +813,7 @@
         test_revision_metadata_missing,
         test_revision_metadata_add__drop_duplicate,
         test_revision_metadata_add__update_in_place_duplicate,
+        test_revision_metadata_add__duplicate_twice,
         test_revision_metadata_get,
     ) = gen_generic_endpoint_tests(
         endpoint_type='revision_metadata',
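
Reviewer note: a minimal standalone sketch of the behaviour the new duplicate check enforces. It assumes the patched swh.indexer package is importable; the mimetype-style rows are only illustrative, not taken from the test data.

    # Sketch of the duplicate check added by this patch.
    # Assumes the patched swh.indexer is on the path; otherwise the helper
    # can be copied verbatim from swh/indexer/storage/__init__.py.
    from swh.indexer.storage import _check_duplicates

    rows = [
        {'id': b'\x01' * 20, 'mimetype': b'text/plain', 'encoding': b'utf-8'},
        {'id': b'\x01' * 20, 'mimetype': b'text/html', 'encoding': b'utf-8'},
    ]

    try:
        _check_duplicates(rows, 'id')   # two rows share the same 'id'
    except ValueError as exc:
        print(exc)                      # The same id is present more than once.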