Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F9345255
D2783.id9887.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
6 KB
Subscribers
None
D2783.id9887.diff
View Options
diff --git a/swh/storage/storage.py b/swh/storage/storage.py
--- a/swh/storage/storage.py
+++ b/swh/storage/storage.py
@@ -33,7 +33,10 @@
from .exc import StorageArgumentException, StorageDBError
from .algos import diff
from .metrics import timed, send_metric, process_metrics
-from .utils import get_partition_bounds_bytes
+from .utils import (
+ get_partition_bounds_bytes, extract_collision_hash,
+ content_to_primary_key
+)
from .writer import JournalWriter
@@ -158,14 +161,27 @@
except psycopg2.IntegrityError as e:
if e.diag.sqlstate == '23505' and \
e.diag.table_name == 'content':
- constraint_to_hash_name = {
- 'content_pkey': 'sha1',
- 'content_sha1_git_idx': 'sha1_git',
- 'content_sha256_idx': 'sha256',
+ message_detail = e.diag.message_detail
+ if message_detail:
+ hash_name, hash_id = extract_collision_hash(message_detail)
+ collision_contents_hashes_as_tuple = [
+ content_to_primary_key(c) for c in content
+ if getattr(c, hash_name) == hash_id
+ ]
+ else:
+ constraint_to_hash_name = {
+ 'content_pkey': 'sha1',
+ 'content_sha1_git_idx': 'sha1_git',
+ 'content_sha256_idx': 'sha256',
}
- colliding_hash_name = constraint_to_hash_name \
- .get(e.diag.constraint_name)
- raise HashCollision(colliding_hash_name) from None
+ hash_name = constraint_to_hash_name \
+ .get(e.diag.constraint_name)
+ hash_id = None
+ collision_contents_hashes_as_tuple = None
+
+ raise HashCollision(
+ hash_name, hash_id, collision_contents_hashes_as_tuple
+ ) from None
else:
raise
diff --git a/swh/storage/tests/test_utils.py b/swh/storage/tests/test_utils.py
new file mode 100644
--- /dev/null
+++ b/swh/storage/tests/test_utils.py
@@ -0,0 +1,54 @@
+# Copyright (C) 2020 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from swh.model.hashutil import hash_to_bytes, DEFAULT_ALGORITHMS
+from swh.model.model import Content
+from swh.storage.utils import extract_collision_hash, content_to_primary_key
+
+from .storage_data import data
+
+
+def test_extract_collision_hash():
+ for msg, expected_result in [
+ (
+ 'Key (sha1)=(\\x34973274ccef6ab4dfaaf86599792fa9c3fe4689) ...',
+ ('sha1', hash_to_bytes(
+ '34973274ccef6ab4dfaaf86599792fa9c3fe4689')),
+ ),
+ (
+ 'Key (sha1_git)=(\\x34973274ccef6ab4dfaaf86599792fa9c3fe4699) already exists', # noqa
+ ('sha1_git', hash_to_bytes(
+ '34973274ccef6ab4dfaaf86599792fa9c3fe4699')),
+ ),
+ (
+ 'Key (sha256)=(\\x673650f936cb3b0a2f93ce09d81be10748b1b203c19e8176b4eefc1964a0cf3a) ...', # noqa
+ ('sha256', hash_to_bytes(
+ '673650f936cb3b0a2f93ce09d81be10748b1b203c19e8176b4eefc1964a0cf3a')) # noqa
+ ),
+ (
+ 'Key (blake2s)=(\\xd5fe1939576527e42cfd76a9455a2432fe7f56669564577dd93c4280e76d661d) ...', # noqa
+ ('blake2s', hash_to_bytes(
+ 'd5fe1939576527e42cfd76a9455a2432fe7f56669564577dd93c4280e76d661d')) # noqa
+ ),
+ ]:
+ assert extract_collision_hash(msg) == expected_result
+
+ assert extract_collision_hash('Nothing matching') is None
+
+
+def test_content_to_primary_key():
+ """Convert contents into a tuple of signatures
+
+ """
+ for content_dict in [data.cont, data.cont2, data.cont3]:
+ c = Content.from_dict(content_dict)
+ expected_hashes = c.hashes()
+
+ actual_pk = content_to_primary_key(c)
+
+ assert len(actual_pk) == len(DEFAULT_ALGORITHMS)
+ for algo, actual_hash in actual_pk:
+ assert algo in DEFAULT_ALGORITHMS
+ assert actual_hash == expected_hashes[algo]
diff --git a/swh/storage/utils.py b/swh/storage/utils.py
--- a/swh/storage/utils.py
+++ b/swh/storage/utils.py
@@ -1,9 +1,14 @@
-# Copyright (C) 2019 The Software Heritage developers
+# Copyright (C) 2019-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
-from typing import Optional, Tuple
+import re
+
+from typing import List, Optional, Tuple
+
+from swh.model.hashutil import hash_to_bytes, DEFAULT_ALGORITHMS
+from swh.model.model import Content
def _is_power_of_two(n: int) -> bool:
@@ -40,3 +45,44 @@
end = None if i == n-1 \
else (partition_size*(i+1)).to_bytes(nb_bytes, 'big')
return (start, end)
+
+
+def extract_collision_hash(error_message: str) -> Optional[Tuple[str, bytes]]:
+ """Utilities to extract the hash information from a hash collision error.
+
+ Hash collision error messages are of the form:
+ 'Key (<hash-type>)=(<double-escaped-hash>) already exists.'
+
+ for example:
+ 'Key (sha1)=(\\x34973274ccef6ab4dfaaf86599792fa9c3fe4689) already exists.'
+
+ Returns:
+ A (hash type, hash as bytes) tuple, or None if the message does not match
+
+ """
+ pattern = r'\w* \((?P<type>[a-z0-9_]+)\)=\((?P<id>[\\a-f0-9x]+)\) \w*'
+ result = re.match(pattern, error_message)
+ if result:
+ hash_type = result.group('type')
+ hash_id = result.group('id').replace('\\x', '')
+ return hash_type, hash_to_bytes(hash_id)
+ return None
+
+
+def content_to_primary_key(content: Content) -> List[
+ Tuple[str, bytes]
+]:
+ """Convert a content to its primary key representation.
+
+ Args:
+ content: Content object
+
+ Returns:
+ a list of (algorithm, hash) tuples representing its primary key
+
+ """
+ primary_key = []
+ for algo in DEFAULT_ALGORITHMS:
+ tuple_key: Tuple[str, bytes] = (algo, getattr(content, algo))
+ primary_key.append(tuple_key)
+ return primary_key
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Thu, Jul 3, 3:15 PM (5 d, 8 h ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3233812
Attached To
D2783: storage(s): Identify and provide the collision hash(es) in HashCollision exception
Event Timeline
Log In to Comment