Page MenuHomeSoftware Heritage

D2783.id9922.diff
No OneTemporary

D2783.id9922.diff

diff --git a/swh/storage/cassandra/storage.py b/swh/storage/cassandra/storage.py
--- a/swh/storage/cassandra/storage.py
+++ b/swh/storage/cassandra/storage.py
@@ -93,7 +93,8 @@
algo, content.get_hash(algo))
if len(pks) > 1:
# There are more than the one we just inserted.
- raise HashCollision(algo, content.get_hash(algo), pks)
+ raise HashCollision(
+ algo, content.get_hash(algo), content.hashes())
summary = {
'content:add': content_add,
diff --git a/swh/storage/in_memory.py b/swh/storage/in_memory.py
--- a/swh/storage/in_memory.py
+++ b/swh/storage/in_memory.py
@@ -92,7 +92,7 @@
hash_ = content.get_hash(algorithm)
if hash_ in self._content_indexes[algorithm]\
and (algorithm not in {'blake2s256', 'sha256'}):
- raise HashCollision(algorithm, hash_, key)
+ raise HashCollision(algorithm, hash_, content.hashes())
for algorithm in DEFAULT_ALGORITHMS:
hash_ = content.get_hash(algorithm)
self._content_indexes[algorithm][hash_].add(key)
diff --git a/swh/storage/storage.py b/swh/storage/storage.py
--- a/swh/storage/storage.py
+++ b/swh/storage/storage.py
@@ -33,7 +33,9 @@
from .exc import StorageArgumentException, StorageDBError
from .algos import diff
from .metrics import timed, send_metric, process_metrics
-from .utils import get_partition_bounds_bytes
+from .utils import (
+ get_partition_bounds_bytes, extract_collision_hash
+)
from .writer import JournalWriter
@@ -158,14 +160,27 @@
except psycopg2.IntegrityError as e:
if e.diag.sqlstate == '23505' and \
e.diag.table_name == 'content':
- constraint_to_hash_name = {
- 'content_pkey': 'sha1',
- 'content_sha1_git_idx': 'sha1_git',
- 'content_sha256_idx': 'sha256',
+ message_detail = e.diag.message_detail
+ if message_detail:
+ hash_name, hash_id = extract_collision_hash(message_detail)
+ collision_content_hashes = [
+ c.hashes() for c in content
+ if getattr(c, hash_name) == hash_id
+ ][0]
+ else:
+ constraint_to_hash_name = {
+ 'content_pkey': 'sha1',
+ 'content_sha1_git_idx': 'sha1_git',
+ 'content_sha256_idx': 'sha256',
}
- colliding_hash_name = constraint_to_hash_name \
- .get(e.diag.constraint_name)
- raise HashCollision(colliding_hash_name) from None
+ hash_name = constraint_to_hash_name \
+ .get(e.diag.constraint_name)
+ hash_id = None
+ collision_content_hashes = None
+
+ raise HashCollision(
+ hash_name, hash_id, collision_content_hashes
+ ) from None
else:
raise
diff --git a/swh/storage/tests/test_utils.py b/swh/storage/tests/test_utils.py
new file mode 100644
--- /dev/null
+++ b/swh/storage/tests/test_utils.py
@@ -0,0 +1,35 @@
+# Copyright (C) 2020 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from swh.model.hashutil import hash_to_bytes
+from swh.storage.utils import extract_collision_hash
+
+
+def test_extract_collision_hash():
+ for msg, expected_result in [
+ (
+ 'Key (sha1)=(\\x34973274ccef6ab4dfaaf86599792fa9c3fe4689) ...',
+ ('sha1', hash_to_bytes(
+ '34973274ccef6ab4dfaaf86599792fa9c3fe4689')),
+ ),
+ (
+ 'Key (sha1_git)=(\\x34973274ccef6ab4dfaaf86599792fa9c3fe4699) already exists', # noqa
+ ('sha1_git', hash_to_bytes(
+ '34973274ccef6ab4dfaaf86599792fa9c3fe4699')),
+ ),
+ (
+ 'Key (sha256)=(\\x673650f936cb3b0a2f93ce09d81be10748b1b203c19e8176b4eefc1964a0cf3a) ...', # noqa
+ ('sha256', hash_to_bytes(
+ '673650f936cb3b0a2f93ce09d81be10748b1b203c19e8176b4eefc1964a0cf3a')) # noqa
+ ),
+ (
+ 'Key (blake2s)=(\\xd5fe1939576527e42cfd76a9455a2432fe7f56669564577dd93c4280e76d661d) ...', # noqa
+ ('blake2s', hash_to_bytes(
+ 'd5fe1939576527e42cfd76a9455a2432fe7f56669564577dd93c4280e76d661d')) # noqa
+ ),
+ ]:
+ assert extract_collision_hash(msg) == expected_result
+
+ assert extract_collision_hash('Nothing matching') is None
diff --git a/swh/storage/utils.py b/swh/storage/utils.py
--- a/swh/storage/utils.py
+++ b/swh/storage/utils.py
@@ -1,10 +1,14 @@
-# Copyright (C) 2019 The Software Heritage developers
+# Copyright (C) 2019-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
+import re
+
from typing import Optional, Tuple
+from swh.model.hashutil import hash_to_bytes
+
def _is_power_of_two(n: int) -> bool:
return n > 0 and n & (n-1) == 0
@@ -40,3 +44,25 @@
end = None if i == n-1 \
else (partition_size*(i+1)).to_bytes(nb_bytes, 'big')
return (start, end)
+
+
+def extract_collision_hash(error_message: str) -> Optional[Tuple[str, bytes]]:
+ """Utilities to extract the hash information from a hash collision error.
+
+ Hash collision error messages are of the form:
+ 'Key (<hash-type>)=(<double-escaped-hash>) already exists.'
+
+ for example:
+ 'Key (sha1)=(\\x34973274ccef6ab4dfaaf86599792fa9c3fe4689) already exists.'
+
+ Return:
+ A (hash name, hash bytes) tuple, or None if nothing matches
+
+ """
+ pattern = r'\w* \((?P<type>[a-z0-9_]+)\)=\((?P<id>[\\a-f0-9x]+)\) \w*'
+ result = re.match(pattern, error_message)
+ if result:
+ hash_type = result.group('type')
+ hash_id = result.group('id').replace('\\x', '')
+ return hash_type, hash_to_bytes(hash_id)
+ return None

File Metadata

Mime Type
text/plain
Expires
Wed, Jul 2, 11:43 AM (1 w, 12 h ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3234031

Event Timeline