Page MenuHomeSoftware Heritage

D2783.id9887.diff
No OneTemporary

D2783.id9887.diff

diff --git a/swh/storage/storage.py b/swh/storage/storage.py
--- a/swh/storage/storage.py
+++ b/swh/storage/storage.py
@@ -33,7 +33,10 @@
from .exc import StorageArgumentException, StorageDBError
from .algos import diff
from .metrics import timed, send_metric, process_metrics
-from .utils import get_partition_bounds_bytes
+from .utils import (
+ get_partition_bounds_bytes, extract_collision_hash,
+ content_to_primary_key
+)
from .writer import JournalWriter
@@ -158,14 +161,27 @@
except psycopg2.IntegrityError as e:
if e.diag.sqlstate == '23505' and \
e.diag.table_name == 'content':
- constraint_to_hash_name = {
- 'content_pkey': 'sha1',
- 'content_sha1_git_idx': 'sha1_git',
- 'content_sha256_idx': 'sha256',
+ message_detail = e.diag.message_detail
+ if message_detail:
+ hash_name, hash_id = extract_collision_hash(message_detail)
+ collision_contents_hashes_as_tuple = [
+ content_to_primary_key(c) for c in content
+ if getattr(c, hash_name) == hash_id
+ ]
+ else:
+ constraint_to_hash_name = {
+ 'content_pkey': 'sha1',
+ 'content_sha1_git_idx': 'sha1_git',
+ 'content_sha256_idx': 'sha256',
}
- colliding_hash_name = constraint_to_hash_name \
- .get(e.diag.constraint_name)
- raise HashCollision(colliding_hash_name) from None
+ hash_name = constraint_to_hash_name \
+ .get(e.diag.constraint_name)
+ hash_id = None
+ collision_contents_hashes_as_tuple = None
+
+ raise HashCollision(
+ hash_name, hash_id, collision_contents_hashes_as_tuple
+ ) from None
else:
raise
diff --git a/swh/storage/tests/test_utils.py b/swh/storage/tests/test_utils.py
new file mode 100644
--- /dev/null
+++ b/swh/storage/tests/test_utils.py
@@ -0,0 +1,54 @@
+# Copyright (C) 2020 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from swh.model.hashutil import hash_to_bytes, DEFAULT_ALGORITHMS
+from swh.model.model import Content
+from swh.storage.utils import extract_collision_hash, content_to_primary_key
+
+from .storage_data import data
+
+
+def test_extract_collision_hash():
+ for msg, expected_result in [
+ (
+ 'Key (sha1)=(\\x34973274ccef6ab4dfaaf86599792fa9c3fe4689) ...',
+ ('sha1', hash_to_bytes(
+ '34973274ccef6ab4dfaaf86599792fa9c3fe4689')),
+ ),
+ (
+ 'Key (sha1_git)=(\\x34973274ccef6ab4dfaaf86599792fa9c3fe4699) already exists', # noqa
+ ('sha1_git', hash_to_bytes(
+ '34973274ccef6ab4dfaaf86599792fa9c3fe4699')),
+ ),
+ (
+ 'Key (sha256)=(\\x673650f936cb3b0a2f93ce09d81be10748b1b203c19e8176b4eefc1964a0cf3a) ...', # noqa
+ ('sha256', hash_to_bytes(
+ '673650f936cb3b0a2f93ce09d81be10748b1b203c19e8176b4eefc1964a0cf3a')) # noqa
+ ),
+ (
+ 'Key (blake2s)=(\\xd5fe1939576527e42cfd76a9455a2432fe7f56669564577dd93c4280e76d661d) ...', # noqa
+ ('blake2s', hash_to_bytes(
+ 'd5fe1939576527e42cfd76a9455a2432fe7f56669564577dd93c4280e76d661d')) # noqa
+ ),
+ ]:
+ assert extract_collision_hash(msg) == expected_result
+
+ assert extract_collision_hash('Nothing matching') is None
+
+
+def test_content_to_primary_key():
+ """Convert contents into a tuple of signatures
+
+ """
+ for content_dict in [data.cont, data.cont2, data.cont3]:
+ c = Content.from_dict(content_dict)
+ expected_hashes = c.hashes()
+
+ actual_pk = content_to_primary_key(c)
+
+ assert len(actual_pk) == len(DEFAULT_ALGORITHMS)
+ for algo, actual_hash in actual_pk:
+ assert algo in DEFAULT_ALGORITHMS
+ assert actual_hash == expected_hashes[algo]
diff --git a/swh/storage/utils.py b/swh/storage/utils.py
--- a/swh/storage/utils.py
+++ b/swh/storage/utils.py
@@ -1,9 +1,14 @@
-# Copyright (C) 2019 The Software Heritage developers
+# Copyright (C) 2019-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
-from typing import Optional, Tuple
+import re
+
+from typing import List, Optional, Tuple
+
+from swh.model.hashutil import hash_to_bytes, DEFAULT_ALGORITHMS
+from swh.model.model import Content
def _is_power_of_two(n: int) -> bool:
@@ -40,3 +45,44 @@
end = None if i == n-1 \
else (partition_size*(i+1)).to_bytes(nb_bytes, 'big')
return (start, end)
+
+
+def extract_collision_hash(error_message: str) -> Optional[Tuple[str, bytes]]:
+ """Utilities to extract the hash information from a hash collision error.
+
+ Hash collision error messages are of the form:
+ 'Key (<hash-type>)=(<double-escaped-hash>) already exists.'
+
+ for example:
+ 'Key (sha1)=(\\x34973274ccef6ab4dfaaf86599792fa9c3fe4689) already exists.'
+
+ Returns:
+ A (hash type, hash as bytes) tuple, or None when nothing matches
+
+ """
+ pattern = r'\w* \((?P<type>[a-z0-9_]+)\)=\((?P<id>[\\a-f0-9x]+)\) \w*'
+ result = re.match(pattern, error_message)
+ if result:
+ hash_type = result.group('type')
+ hash_id = result.group('id').replace('\\x', '')
+ return hash_type, hash_to_bytes(hash_id)
+ return None
+
+
+def content_to_primary_key(content: Content) -> List[
+ Tuple[str, bytes]
+]:
+ """Convert a content to its primary key representation.
+
+ Args:
+ content: Content object
+
+ Returns:
+ a list of (algorithm, hash) tuples representing its primary key
+
+ """
+ primary_key = []
+ for algo in DEFAULT_ALGORITHMS:
+ tuple_key: Tuple[str, bytes] = (algo, getattr(content, algo))
+ primary_key.append(tuple_key)
+ return primary_key

File Metadata

Mime Type
text/plain
Expires
Thu, Jul 3, 3:15 PM (5 d, 8 h ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3233812

Event Timeline