Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F7163746
D1079.id3523.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
6 KB
Subscribers
None
D1079.id3523.diff
View Options
diff --git a/swh/indexer/storage/__init__.py b/swh/indexer/storage/__init__.py
--- a/swh/indexer/storage/__init__.py
+++ b/swh/indexer/storage/__init__.py
@@ -49,6 +49,34 @@
return IndexerStorage(**args)
+def _check_duplicates(data, key):
+ """
+ Raises a `ValueError` if any two dictionaries in `data` share
+ the same value for the given key.
+
+ Values associated with the key must be hashable.
+
+ Args:
+ data (List[dict]): List of dictionaries to be inserted
+ key (str): Name of the dictionary key that acts as a unique identifier.
+
+ >>> _check_duplicates([
+ ... {'id': 'foo', 'data': 'spam'},
+ ... {'id': 'bar', 'data': 'egg'},
+ ... ], 'id')
+ >>> _check_duplicates([
+ ... {'id': 'foo', 'data': 'spam'},
+ ... {'id': 'foo', 'data': 'egg'},
+ ... ], 'id')
+ Traceback (most recent call last):
+ ...
+ ValueError: The same id is present more than once.
+ """
+ if len({item[key] for item in data}) < len(data):
+ raise ValueError(
+ 'The same {} is present more than once.'.format(key))
+
+
class IndexerStorage:
"""SWH Indexer Storage
@@ -216,6 +244,7 @@
default)
"""
+ _check_duplicates(mimetypes, 'id')
db.mktemp_content_mimetype(cur)
db.copy_to(mimetypes, 'tmp_content_mimetype',
['id', 'mimetype', 'encoding', 'indexer_configuration_id'],
@@ -300,6 +329,7 @@
default)
"""
+ _check_duplicates(languages, 'id')
db.mktemp_content_language(cur)
# empty language is mapped to 'unknown'
db.copy_to(
@@ -369,6 +399,8 @@
line, lang
"""
+ _check_duplicates(ctags, 'id')
+
def _convert_ctags(__ctags):
"""Convert ctags dict to list of ctags.
@@ -449,7 +481,7 @@
list: content_license entries which failed due to unknown licenses
"""
- # Then, we add the correct ones
+ _check_duplicates(licenses, 'id')
db.mktemp_content_fossology_license(cur)
db.copy_to(
({
@@ -547,6 +579,8 @@
or skip duplicates (false, the default)
"""
+ _check_duplicates(metadata, 'id')
+
db.mktemp_content_metadata(cur)
db.copy_to(metadata, 'tmp_content_metadata',
@@ -614,6 +648,8 @@
or skip duplicates (false, the default)
"""
+ _check_duplicates(metadata, 'id')
+
db.mktemp_revision_metadata(cur)
db.copy_to(metadata, 'tmp_revision_metadata',
@@ -666,6 +702,8 @@
or skip duplicates (false, the default)
"""
+ _check_duplicates(metadata, 'origin_id')
+
db.mktemp_origin_intrinsic_metadata(cur)
db.copy_to(metadata, 'tmp_origin_intrinsic_metadata',
diff --git a/swh/indexer/storage/in_memory.py b/swh/indexer/storage/in_memory.py
--- a/swh/indexer/storage/in_memory.py
+++ b/swh/indexer/storage/in_memory.py
@@ -130,6 +130,10 @@
(true) or skip duplicates (false)
"""
+ data = list(data)
+ if len({x['id'] for x in data}) < len(data):
+ # Raise the same exception the pgsql backend raises, so both
+ raise ValueError('The same id is present more than once.')
for item in data:
item = item.copy()
tool_id = item.pop('indexer_configuration_id')
diff --git a/swh/indexer/tests/storage/test_storage.py b/swh/indexer/tests/storage/test_storage.py
--- a/swh/indexer/tests/storage/test_storage.py
+++ b/swh/indexer/tests/storage/test_storage.py
@@ -223,6 +223,41 @@
# data did change as the v2 was used to overwrite v1
self.assertEqual(actual_data, expected_data_v2)
+ def add__duplicate_twice(self):
+ # given
+ tool_id = self.tools[tool_name]['id']
+
+ data_rev1 = {
+ 'id': self.revision_id_2,
+ **example_data1,
+ 'indexer_configuration_id': tool_id
+ }
+
+ data_rev2 = {
+ 'id': self.revision_id_2,
+ **example_data2,
+ 'indexer_configuration_id': tool_id
+ }
+
+ # when
+ endpoint(self, 'add')([data_rev1])
+
+ with self.assertRaises(ValueError):
+ endpoint(self, 'add')(
+ [data_rev2, data_rev2],
+ conflict_update=True)
+
+ # then
+ actual_data = list(endpoint(self, 'get')(
+ [self.revision_id_2, self.revision_id_1]))
+
+ expected_data = [{
+ 'id': self.revision_id_2,
+ **example_data1,
+ 'tool': self.tools[tool_name]
+ }]
+ self.assertEqual(actual_data, expected_data)
+
@rename
def get(self):
# given
@@ -255,6 +290,7 @@
missing,
add__drop_duplicate,
add__update_in_place_duplicate,
+ add__duplicate_twice,
get,
)
@@ -300,6 +336,7 @@
test_content_mimetype_missing,
test_content_mimetype_add__drop_duplicate,
test_content_mimetype_add__update_in_place_duplicate,
+ test_content_mimetype_add__duplicate_twice,
test_content_mimetype_get,
) = gen_generic_endpoint_tests(
endpoint_type='content_mimetype',
@@ -319,6 +356,7 @@
test_content_language_missing,
test_content_language_add__drop_duplicate,
test_content_language_add__update_in_place_duplicate,
+ test_content_language_add__duplicate_twice,
test_content_language_get,
) = gen_generic_endpoint_tests(
endpoint_type='content_language',
@@ -337,6 +375,7 @@
# the following tests are disabled because CTAGS behave differently
_, # test_content_ctags_add__drop_duplicate,
_, # test_content_ctags_add__update_in_place_duplicate,
+ _, # test_content_ctags_add__duplicate_twice,
_, # test_content_ctags_get,
) = gen_generic_endpoint_tests(
endpoint_type='content_ctags',
@@ -743,6 +782,7 @@
test_content_metadata_missing,
test_content_metadata_add__drop_duplicate,
test_content_metadata_add__update_in_place_duplicate,
+ test_content_metadata_add__duplicate_twice,
test_content_metadata_get,
) = gen_generic_endpoint_tests(
endpoint_type='content_metadata',
@@ -773,6 +813,7 @@
test_revision_metadata_missing,
test_revision_metadata_add__drop_duplicate,
test_revision_metadata_add__update_in_place_duplicate,
+ test_revision_metadata_add__duplicate_twice,
test_revision_metadata_get,
) = gen_generic_endpoint_tests(
endpoint_type='revision_metadata',
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Thu, Jan 30, 2:38 PM (1 w, 11 h ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3221970
Attached To
D1079: Add checks in the idx_storage that the same content/rev/orig is not present twice in the new data.
Event Timeline
Log In to Comment