Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F9313519
D2783.id9922.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
6 KB
Subscribers
None
D2783.id9922.diff
View Options
diff --git a/swh/storage/cassandra/storage.py b/swh/storage/cassandra/storage.py
--- a/swh/storage/cassandra/storage.py
+++ b/swh/storage/cassandra/storage.py
@@ -93,7 +93,8 @@
algo, content.get_hash(algo))
if len(pks) > 1:
# There are more than the one we just inserted.
- raise HashCollision(algo, content.get_hash(algo), pks)
+ raise HashCollision(
+ algo, content.get_hash(algo), content.hashes())
summary = {
'content:add': content_add,
diff --git a/swh/storage/in_memory.py b/swh/storage/in_memory.py
--- a/swh/storage/in_memory.py
+++ b/swh/storage/in_memory.py
@@ -92,7 +92,7 @@
hash_ = content.get_hash(algorithm)
if hash_ in self._content_indexes[algorithm]\
and (algorithm not in {'blake2s256', 'sha256'}):
- raise HashCollision(algorithm, hash_, key)
+ raise HashCollision(algorithm, hash_, content.hashes())
for algorithm in DEFAULT_ALGORITHMS:
hash_ = content.get_hash(algorithm)
self._content_indexes[algorithm][hash_].add(key)
diff --git a/swh/storage/storage.py b/swh/storage/storage.py
--- a/swh/storage/storage.py
+++ b/swh/storage/storage.py
@@ -33,7 +33,9 @@
from .exc import StorageArgumentException, StorageDBError
from .algos import diff
from .metrics import timed, send_metric, process_metrics
-from .utils import get_partition_bounds_bytes
+from .utils import (
+ get_partition_bounds_bytes, extract_collision_hash
+)
from .writer import JournalWriter
@@ -158,14 +160,27 @@
except psycopg2.IntegrityError as e:
if e.diag.sqlstate == '23505' and \
e.diag.table_name == 'content':
- constraint_to_hash_name = {
- 'content_pkey': 'sha1',
- 'content_sha1_git_idx': 'sha1_git',
- 'content_sha256_idx': 'sha256',
+ message_detail = e.diag.message_detail
+ if message_detail:
+ hash_name, hash_id = extract_collision_hash(message_detail)
+ collision_content_hashes = [
+ c.hashes() for c in content
+ if getattr(c, hash_name) == hash_id
+ ][0]
+ else:
+ constraint_to_hash_name = {
+ 'content_pkey': 'sha1',
+ 'content_sha1_git_idx': 'sha1_git',
+ 'content_sha256_idx': 'sha256',
}
- colliding_hash_name = constraint_to_hash_name \
- .get(e.diag.constraint_name)
- raise HashCollision(colliding_hash_name) from None
+ hash_name = constraint_to_hash_name \
+ .get(e.diag.constraint_name)
+ hash_id = None
+ collision_content_hashes = None
+
+ raise HashCollision(
+ hash_name, hash_id, collision_content_hashes
+ ) from None
else:
raise
diff --git a/swh/storage/tests/test_utils.py b/swh/storage/tests/test_utils.py
new file mode 100644
--- /dev/null
+++ b/swh/storage/tests/test_utils.py
@@ -0,0 +1,35 @@
+# Copyright (C) 2020 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from swh.model.hashutil import hash_to_bytes
+from swh.storage.utils import extract_collision_hash
+
+
+def test_extract_collision_hash():
+ for msg, expected_result in [
+ (
+ 'Key (sha1)=(\\x34973274ccef6ab4dfaaf86599792fa9c3fe4689) ...',
+ ('sha1', hash_to_bytes(
+ '34973274ccef6ab4dfaaf86599792fa9c3fe4689')),
+ ),
+ (
+ 'Key (sha1_git)=(\\x34973274ccef6ab4dfaaf86599792fa9c3fe4699) already exists', # noqa
+ ('sha1_git', hash_to_bytes(
+ '34973274ccef6ab4dfaaf86599792fa9c3fe4699')),
+ ),
+ (
+ 'Key (sha256)=(\\x673650f936cb3b0a2f93ce09d81be10748b1b203c19e8176b4eefc1964a0cf3a) ...', # noqa
+ ('sha256', hash_to_bytes(
+ '673650f936cb3b0a2f93ce09d81be10748b1b203c19e8176b4eefc1964a0cf3a')) # noqa
+ ),
+ (
+ 'Key (blake2s)=(\\xd5fe1939576527e42cfd76a9455a2432fe7f56669564577dd93c4280e76d661d) ...', # noqa
+ ('blake2s', hash_to_bytes(
+ 'd5fe1939576527e42cfd76a9455a2432fe7f56669564577dd93c4280e76d661d')) # noqa
+ ),
+ ]:
+ assert extract_collision_hash(msg) == expected_result
+
+ assert extract_collision_hash('Nothing matching') is None
diff --git a/swh/storage/utils.py b/swh/storage/utils.py
--- a/swh/storage/utils.py
+++ b/swh/storage/utils.py
@@ -1,10 +1,14 @@
-# Copyright (C) 2019 The Software Heritage developers
+# Copyright (C) 2019-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
+import re
+
from typing import Optional, Tuple
+from swh.model.hashutil import hash_to_bytes
+
def _is_power_of_two(n: int) -> bool:
return n > 0 and n & (n-1) == 0
@@ -40,3 +44,25 @@
end = None if i == n-1 \
else (partition_size*(i+1)).to_bytes(nb_bytes, 'big')
return (start, end)
+
+
+def extract_collision_hash(error_message: str) -> Optional[Tuple[str, bytes]]:
+ """Extract the hash information from a hash collision error message.
+
+ Hash collision error messages are of the form:
+ 'Key (<hash-type>)=(<double-escaped-hash>) already exists.'
+
+ for example:
+ 'Key (sha1)=(\\x34973274ccef6ab4dfaaf86599792fa9c3fe4689) already exists.'
+
+ Returns:
+ A (hash type, hash as bytes) tuple, or None if the message does not match
+
+ """
+ pattern = r'\w* \((?P<type>[a-z0-9_]+)\)=\((?P<id>[\\a-f0-9x]+)\) \w*'
+ result = re.match(pattern, error_message)
+ if result:
+ hash_type = result.group('type')
+ hash_id = result.group('id').replace('\\x', '')
+ return hash_type, hash_to_bytes(hash_id)
+ return None
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Wed, Jul 2, 11:43 AM (1 w, 12 h ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3234031
Attached To
D2783: storage(s): Identify and provide the collision hash(es) in HashCollision exception
Event Timeline
Log In to Comment