Page MenuHomeSoftware Heritage

D1079.id3523.diff
No OneTemporary

D1079.id3523.diff

diff --git a/swh/indexer/storage/__init__.py b/swh/indexer/storage/__init__.py
--- a/swh/indexer/storage/__init__.py
+++ b/swh/indexer/storage/__init__.py
@@ -49,6 +49,34 @@
return IndexerStorage(**args)
+def _check_duplicates(data, key):
+ """
+ If any two dictionaries in `data` have the same value for the
+ key, raises a `ValueError`.
+
+ Values associated to the key must be hashable.
+
+ Args:
+ data (List[dict]): List of dictionaries to be inserted
+ key (str): Name of the key that acts as id.
+
+ >>> _check_duplicates([
+ ... {'id': 'foo', 'data': 'spam'},
+ ... {'id': 'bar', 'data': 'egg'},
+ ... ], 'id')
+ >>> _check_duplicates([
+ ... {'id': 'foo', 'data': 'spam'},
+ ... {'id': 'foo', 'data': 'egg'},
+ ... ], 'id')
+ Traceback (most recent call last):
+ ...
+ ValueError: The same id is present more than once.
+ """
+ if len({item[key] for item in data}) < len(data):
+ raise ValueError(
+ 'The same {} is present more than once.'.format(key))
+
+
class IndexerStorage:
"""SWH Indexer Storage
@@ -216,6 +244,7 @@
default)
"""
+ _check_duplicates(mimetypes, 'id')
db.mktemp_content_mimetype(cur)
db.copy_to(mimetypes, 'tmp_content_mimetype',
['id', 'mimetype', 'encoding', 'indexer_configuration_id'],
@@ -300,6 +329,7 @@
default)
"""
+ _check_duplicates(languages, 'id')
db.mktemp_content_language(cur)
# empty language is mapped to 'unknown'
db.copy_to(
@@ -369,6 +399,8 @@
line, lang
"""
+ _check_duplicates(ctags, 'id')
+
def _convert_ctags(__ctags):
"""Convert ctags dict to list of ctags.
@@ -449,7 +481,7 @@
list: content_license entries which failed due to unknown licenses
"""
- # Then, we add the correct ones
+ _check_duplicates(licenses, 'id')
db.mktemp_content_fossology_license(cur)
db.copy_to(
({
@@ -547,6 +579,8 @@
or skip duplicates (false, the default)
"""
+ _check_duplicates(metadata, 'id')
+
db.mktemp_content_metadata(cur)
db.copy_to(metadata, 'tmp_content_metadata',
@@ -614,6 +648,8 @@
or skip duplicates (false, the default)
"""
+ _check_duplicates(metadata, 'id')
+
db.mktemp_revision_metadata(cur)
db.copy_to(metadata, 'tmp_revision_metadata',
@@ -666,6 +702,8 @@
or skip duplicates (false, the default)
"""
+ _check_duplicates(metadata, 'origin_id')
+
db.mktemp_origin_intrinsic_metadata(cur)
db.copy_to(metadata, 'tmp_origin_intrinsic_metadata',
diff --git a/swh/indexer/storage/in_memory.py b/swh/indexer/storage/in_memory.py
--- a/swh/indexer/storage/in_memory.py
+++ b/swh/indexer/storage/in_memory.py
@@ -130,6 +130,10 @@
(true) or skip duplicates (false)
"""
+ data = list(data)
+ if len({x['id'] for x in data}) < len(data):
+ # For "exception-compatibility" with the pgsql backend
+ raise ValueError('The same id is present more than once.')
for item in data:
item = item.copy()
tool_id = item.pop('indexer_configuration_id')
diff --git a/swh/indexer/tests/storage/test_storage.py b/swh/indexer/tests/storage/test_storage.py
--- a/swh/indexer/tests/storage/test_storage.py
+++ b/swh/indexer/tests/storage/test_storage.py
@@ -223,6 +223,41 @@
# data did change as the v2 was used to overwrite v1
self.assertEqual(actual_data, expected_data_v2)
+ def add__duplicate_twice(self):
+ # given
+ tool_id = self.tools[tool_name]['id']
+
+ data_rev1 = {
+ 'id': self.revision_id_2,
+ **example_data1,
+ 'indexer_configuration_id': tool_id
+ }
+
+ data_rev2 = {
+ 'id': self.revision_id_2,
+ **example_data2,
+ 'indexer_configuration_id': tool_id
+ }
+
+ # when
+ endpoint(self, 'add')([data_rev1])
+
+ with self.assertRaises(ValueError):
+ endpoint(self, 'add')(
+ [data_rev2, data_rev2],
+ conflict_update=True)
+
+ # then
+ actual_data = list(endpoint(self, 'get')(
+ [self.revision_id_2, self.revision_id_1]))
+
+ expected_data = [{
+ 'id': self.revision_id_2,
+ **example_data1,
+ 'tool': self.tools[tool_name]
+ }]
+ self.assertEqual(actual_data, expected_data)
+
@rename
def get(self):
# given
@@ -255,6 +290,7 @@
missing,
add__drop_duplicate,
add__update_in_place_duplicate,
+ add__duplicate_twice,
get,
)
@@ -300,6 +336,7 @@
test_content_mimetype_missing,
test_content_mimetype_add__drop_duplicate,
test_content_mimetype_add__update_in_place_duplicate,
+ test_content_mimetype_add__duplicate_twice,
test_content_mimetype_get,
) = gen_generic_endpoint_tests(
endpoint_type='content_mimetype',
@@ -319,6 +356,7 @@
test_content_language_missing,
test_content_language_add__drop_duplicate,
test_content_language_add__update_in_place_duplicate,
+ test_content_language_add__duplicate_twice,
test_content_language_get,
) = gen_generic_endpoint_tests(
endpoint_type='content_language',
@@ -337,6 +375,7 @@
# the following tests are disabled because CTAGS behave differently
_, # test_content_ctags_add__drop_duplicate,
_, # test_content_ctags_add__update_in_place_duplicate,
+ _, # test_content_ctags_add__duplicate_twice,
_, # test_content_ctags_get,
) = gen_generic_endpoint_tests(
endpoint_type='content_ctags',
@@ -743,6 +782,7 @@
test_content_metadata_missing,
test_content_metadata_add__drop_duplicate,
test_content_metadata_add__update_in_place_duplicate,
+ test_content_metadata_add__duplicate_twice,
test_content_metadata_get,
) = gen_generic_endpoint_tests(
endpoint_type='content_metadata',
@@ -773,6 +813,7 @@
test_revision_metadata_missing,
test_revision_metadata_add__drop_duplicate,
test_revision_metadata_add__update_in_place_duplicate,
+ test_revision_metadata_add__duplicate_twice,
test_revision_metadata_get,
) = gen_generic_endpoint_tests(
endpoint_type='revision_metadata',

File Metadata

Mime Type
text/plain
Expires
Thu, Jan 30, 2:38 PM (1 w, 11 h ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3221970

Event Timeline