diff --git a/swh/indexer/storage/__init__.py b/swh/indexer/storage/__init__.py --- a/swh/indexer/storage/__init__.py +++ b/swh/indexer/storage/__init__.py @@ -215,6 +215,7 @@ default) """ + self._check_duplicates(mimetypes, 'id') db.mktemp_content_mimetype(cur) db.copy_to(mimetypes, 'tmp_content_mimetype', ['id', 'mimetype', 'encoding', 'indexer_configuration_id'], @@ -299,6 +300,7 @@ default) """ + self._check_duplicates(languages, 'id') db.mktemp_content_language(cur) # empty language is mapped to 'unknown' db.copy_to( @@ -368,6 +370,8 @@ line, lang """ + self._check_duplicates(ctags, 'id') + def _convert_ctags(__ctags): """Convert ctags dict to list of ctags. @@ -448,7 +452,7 @@ list: content_license entries which failed due to unknown licenses """ - # Then, we add the correct ones + self._check_duplicates(licenses, 'id') db.mktemp_content_fossology_license(cur) db.copy_to( ({ @@ -546,6 +550,8 @@ or skip duplicates (false, the default) """ + self._check_duplicates(metadata, 'id') + db.mktemp_content_metadata(cur) db.copy_to(metadata, 'tmp_content_metadata', @@ -613,6 +619,8 @@ or skip duplicates (false, the default) """ + self._check_duplicates(metadata, 'id') + db.mktemp_revision_metadata(cur) db.copy_to(metadata, 'tmp_revision_metadata', @@ -665,6 +673,8 @@ or skip duplicates (false, the default) """ + self._check_duplicates(metadata, 'origin_id') + db.mktemp_origin_intrinsic_metadata(cur) db.copy_to(metadata, 'tmp_origin_intrinsic_metadata', @@ -754,3 +764,19 @@ if not idx: return None return dict(zip(db.indexer_configuration_cols, idx)) + + def _check_duplicates(self, data, column): + """ + If any two dictionaries in `data` have the same value for the + column, raises a `ValueError`. + + Prevents 'psycopg2.ProgrammingError: ON CONFLICT DO UPDATE + command cannot affect row a second time' when conflict_update=True. + + Args: + data (List[dict]): List of dictionaries to be inserted + column (str): Name of the column that acts as id. 
+ """ + if len({item[column] for item in data}) < len(data): + raise ValueError( + 'The same {} is present more than once.'.format(column)) diff --git a/swh/indexer/storage/in_memory.py b/swh/indexer/storage/in_memory.py --- a/swh/indexer/storage/in_memory.py +++ b/swh/indexer/storage/in_memory.py @@ -128,6 +128,10 @@ (true) or skip duplicates (false) """ + data = list(data) + if len({x['id'] for x in data}) < len(data): + # For "exception-compatibility" with the pgsql backend + raise ValueError('The same id is present more than once.') for item in data: item = item.copy() tool_id = item.pop('indexer_configuration_id') diff --git a/swh/indexer/tests/storage/test_storage.py b/swh/indexer/tests/storage/test_storage.py --- a/swh/indexer/tests/storage/test_storage.py +++ b/swh/indexer/tests/storage/test_storage.py @@ -1399,6 +1399,50 @@ # metadata did change as the v2 was used to overwrite v1 self.assertEqual(actual_metadata, expected_metadata_v2) + def test_revision_metadata_add_duplicate_twice(self): + # given + tool_id = self.tools['swh-metadata-detector']['id'] + + metadata_rev1 = { + 'id': self.revision_id_2, + 'translated_metadata': { + 'identifier': {'@id': 'foo'}, + }, + 'mappings': ['mapping1', 'mapping2'], + 'indexer_configuration_id': tool_id + } + + metadata_rev2 = { + 'id': self.revision_id_2, + 'translated_metadata': { + 'identifier': {'@id': 'bar'}, + }, + 'mappings': ['mapping1', 'mapping2'], + 'indexer_configuration_id': tool_id + } + + # when + self.storage.revision_metadata_add([metadata_rev1]) + + with self.assertRaises(ValueError): + self.storage.revision_metadata_add( + [metadata_rev2, metadata_rev2], + conflict_update=True) + + # then + actual_metadata = list(self.storage.revision_metadata_get( + [self.revision_id_2, self.revision_id_1])) + + expected_metadata = [{ + 'id': self.revision_id_2, + 'translated_metadata': { + 'identifier': {'@id': 'foo'}, + }, + 'mappings': ['mapping1', 'mapping2'], + 'tool': self.tools['swh-metadata-detector'] + }] + 
self.assertEqual(actual_metadata, expected_metadata) + def test_origin_intrinsic_metadata_get(self): # given tool_id = self.tools['swh-metadata-detector']['id']