Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F7066556
D2783.id9938.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
11 KB
Subscribers
None
D2783.id9938.diff
View Options
diff --git a/swh/storage/cassandra/converters.py b/swh/storage/cassandra/converters.py
--- a/swh/storage/cassandra/converters.py
+++ b/swh/storage/cassandra/converters.py
@@ -8,12 +8,15 @@
import attr
+from typing import Dict
+
from swh.model.model import (
RevisionType, ObjectType, Revision, Release,
)
-
+from swh.model.hashutil import DEFAULT_ALGORITHMS
from ..converters import git_headers_to_db, db_to_git_headers
+from .common import Row
def revision_to_db(revision: Revision) -> Revision:
@@ -61,3 +64,13 @@
target_type=ObjectType(release.target_type),
)
return release
+
+
+def row_to_content_hashes(row: Row) -> Dict[str, bytes]:
+ """Convert cassandra row to a content hashes
+
+ """
+ hashes = {}
+ for algo in DEFAULT_ALGORITHMS:
+ hashes[algo] = getattr(row, algo)
+ return hashes
diff --git a/swh/storage/cassandra/storage.py b/swh/storage/cassandra/storage.py
--- a/swh/storage/cassandra/storage.py
+++ b/swh/storage/cassandra/storage.py
@@ -26,6 +26,7 @@
from .common import TOKEN_BEGIN, TOKEN_END
from .converters import (
revision_to_db, revision_from_db, release_to_db, release_from_db,
+ row_to_content_hashes,
)
from .cql import CqlRunner
from .schema import HASH_ALGORITHMS
@@ -93,7 +94,11 @@
algo, content.get_hash(algo))
if len(pks) > 1:
# There are more than the one we just inserted.
- raise HashCollision(algo, content.get_hash(algo), pks)
+ colliding_content_hashes = [
+ row_to_content_hashes(pk) for pk in pks
+ ]
+ raise HashCollision(
+ algo, content.get_hash(algo), colliding_content_hashes)
summary = {
'content:add': content_add,
diff --git a/swh/storage/in_memory.py b/swh/storage/in_memory.py
--- a/swh/storage/in_memory.py
+++ b/swh/storage/in_memory.py
@@ -92,7 +92,16 @@
hash_ = content.get_hash(algorithm)
if hash_ in self._content_indexes[algorithm]\
and (algorithm not in {'blake2s256', 'sha256'}):
- raise HashCollision(algorithm, hash_, key)
+ colliding_content_hashes = []
+ # Add the already stored contents
+ for content_hashes_set in self._content_indexes[
+ algorithm][hash_]:
+ hashes = dict(content_hashes_set)
+ colliding_content_hashes.append(hashes)
+ # Add the new colliding content
+ colliding_content_hashes.append(content.hashes())
+ raise HashCollision(
+ algorithm, hash_, colliding_content_hashes)
for algorithm in DEFAULT_ALGORITHMS:
hash_ = content.get_hash(algorithm)
self._content_indexes[algorithm][hash_].add(key)
diff --git a/swh/storage/storage.py b/swh/storage/storage.py
--- a/swh/storage/storage.py
+++ b/swh/storage/storage.py
@@ -33,7 +33,9 @@
from .exc import StorageArgumentException, StorageDBError
from .algos import diff
from .metrics import timed, send_metric, process_metrics
-from .utils import get_partition_bounds_bytes
+from .utils import (
+ get_partition_bounds_bytes, extract_collision_hash
+)
from .writer import JournalWriter
@@ -158,14 +160,27 @@
except psycopg2.IntegrityError as e:
if e.diag.sqlstate == '23505' and \
e.diag.table_name == 'content':
- constraint_to_hash_name = {
- 'content_pkey': 'sha1',
- 'content_sha1_git_idx': 'sha1_git',
- 'content_sha256_idx': 'sha256',
+ message_detail = e.diag.message_detail
+ if message_detail:
+ hash_name, hash_id = extract_collision_hash(message_detail)
+ collision_contents_hashes = [
+ c.hashes() for c in content
+ if c.get_hash(hash_name) == hash_id
+ ]
+ else:
+ constraint_to_hash_name = {
+ 'content_pkey': 'sha1',
+ 'content_sha1_git_idx': 'sha1_git',
+ 'content_sha256_idx': 'sha256',
}
- colliding_hash_name = constraint_to_hash_name \
- .get(e.diag.constraint_name)
- raise HashCollision(colliding_hash_name) from None
+ hash_name = constraint_to_hash_name \
+ .get(e.diag.constraint_name)
+ hash_id = None
+ collision_contents_hashes = None
+
+ raise HashCollision(
+ hash_name, hash_id, collision_contents_hashes
+ ) from None
else:
raise
diff --git a/swh/storage/tests/test_cassandra_converters.py b/swh/storage/tests/test_cassandra_converters.py
new file mode 100644
--- /dev/null
+++ b/swh/storage/tests/test_cassandra_converters.py
@@ -0,0 +1,37 @@
+# Copyright (C) 2020 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from collections import namedtuple
+from typing import List
+
+from swh.storage.cassandra import converters
+
+from swh.model.hashutil import DEFAULT_ALGORITHMS
+
+
+# Test purposes
+field_names: List[str] = list(DEFAULT_ALGORITHMS)
+Row = namedtuple('Row', field_names) # type: ignore
+
+
+def test_row_to_content_hashes():
+ for row in [Row(
+ sha1=b'4\x972t\xcc\xefj\xb4\xdf\xaa\xf8e\x99y/\xa9\xc3\xfeF\x89',
+ sha1_git=b'\xd8\x1c\xc0q\x0e\xb6\xcf\x9e\xfd[\x92\n\x84S\xe1\xe0qW\xb6\xcd', # noqa
+ sha256=b'g6P\xf96\xcb;\n/\x93\xce\t\xd8\x1b\xe1\x07H\xb1\xb2\x03\xc1\x9e\x81v\xb4\xee\xfc\x19d\xa0\xcf:', # noqa
+ blake2s256=b"\xd5\xfe\x199We'\xe4,\xfdv\xa9EZ$2\xfe\x7fVf\x95dW}\xd9<B\x80\xe7mf\x1d", # noqa
+ ),
+ Row(
+ sha1=b'4\x972t\xcc\xefj\xb4\xdf\xaa\xf8e\x99y/\xa9\xc3\xfeF\x89', # noqa
+ sha1_git=b'\xd8\x1c\xc0q\x0e\xb6\xcf\x9e\xfd[\x92\n\x84S\xe1\xe0qW\xb6\xcd', # noqa
+ sha256=b'h6P\xf96\xcb;\n/\x93\xce\t\xd8\x1b\xe1\x07H\xb1\xb2\x03\xc1\x9e\x81v\xb4\xee\xfc\x19d\xa0\xcf:', # noqa
+ blake2s256=b"\xd5\xfe\x199We'\xe4,\xfdv\xa9EZ$2\xfe\x7fVf\x95dW}\xd9<B\x80\xe7mf\x1d", # noqa
+ ),
+ ]:
+ actual_hashes = converters.row_to_content_hashes(row)
+
+ assert len(actual_hashes) == len(DEFAULT_ALGORITHMS)
+ for algo in DEFAULT_ALGORITHMS:
+ assert actual_hashes[algo] == getattr(row, algo)
diff --git a/swh/storage/tests/test_storage.py b/swh/storage/tests/test_storage.py
--- a/swh/storage/tests/test_storage.py
+++ b/swh/storage/tests/test_storage.py
@@ -312,7 +312,15 @@
with pytest.raises(HashCollision) as cm:
swh_storage.content_add([cont1, cont1b])
- assert cm.value.args[0] in ['sha1', 'sha1_git', 'blake2s256']
+ actual_algo = cm.value.args[0]
+ assert actual_algo in ['sha1', 'sha1_git', 'blake2s256']
+ actual_id = cm.value.args[1]
+ assert actual_id == cont1[actual_algo]
+ assert len(cm.value.args[2]) == 2
+ assert cm.value.args[2] == [
+ Content.from_dict(cont1).hashes(),
+ Content.from_dict(cont1b).hashes()
+ ]
def test_content_update(self, swh_storage):
if hasattr(swh_storage, 'storage'):
@@ -370,14 +378,22 @@
# create (corrupted) content with same sha1{,_git} but != sha256
cont1b = cont1.copy()
- sha256_array = bytearray(cont1b['sha256'])
- sha256_array[0] += 1
- cont1b['sha256'] = bytes(sha256_array)
+ sha1_git_array = bytearray(cont1b['sha256'])
+ sha1_git_array[0] += 1
+ cont1b['sha256'] = bytes(sha1_git_array)
with pytest.raises(HashCollision) as cm:
swh_storage.content_add_metadata([cont1, cont1b])
- assert cm.value.args[0] in ['sha1', 'sha1_git', 'blake2s256']
+ actual_algo = cm.value.args[0]
+ assert actual_algo in ['sha1', 'sha1_git', 'blake2s256']
+ actual_id = cm.value.args[1]
+ assert actual_id == cont1[actual_algo]
+ assert len(cm.value.args[2]) == 2
+ assert cm.value.args[2] == [
+ Content.from_dict(cont1).hashes(),
+ Content.from_dict(cont1b).hashes()
+ ]
def test_skipped_content_add(self, swh_storage):
cont = data.skipped_cont
diff --git a/swh/storage/tests/test_utils.py b/swh/storage/tests/test_utils.py
new file mode 100644
--- /dev/null
+++ b/swh/storage/tests/test_utils.py
@@ -0,0 +1,35 @@
+# Copyright (C) 2020 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from swh.model.hashutil import hash_to_bytes
+from swh.storage.utils import extract_collision_hash
+
+
+def test_extract_collision_hash():
+ for msg, expected_result in [
+ (
+ 'Key (sha1)=(\\x34973274ccef6ab4dfaaf86599792fa9c3fe4689) ...',
+ ('sha1', hash_to_bytes(
+ '34973274ccef6ab4dfaaf86599792fa9c3fe4689')),
+ ),
+ (
+ 'Key (sha1_git)=(\\x34973274ccef6ab4dfaaf86599792fa9c3fe4699) already exists', # noqa
+ ('sha1_git', hash_to_bytes(
+ '34973274ccef6ab4dfaaf86599792fa9c3fe4699')),
+ ),
+ (
+ 'Key (sha256)=(\\x673650f936cb3b0a2f93ce09d81be10748b1b203c19e8176b4eefc1964a0cf3a) ...', # noqa
+ ('sha256', hash_to_bytes(
+ '673650f936cb3b0a2f93ce09d81be10748b1b203c19e8176b4eefc1964a0cf3a')) # noqa
+ ),
+ (
+ 'Key (blake2s)=(\\xd5fe1939576527e42cfd76a9455a2432fe7f56669564577dd93c4280e76d661d) ...', # noqa
+ ('blake2s', hash_to_bytes(
+ 'd5fe1939576527e42cfd76a9455a2432fe7f56669564577dd93c4280e76d661d')) # noqa
+ ),
+ ]:
+ assert extract_collision_hash(msg) == expected_result
+
+ assert extract_collision_hash('Nothing matching') is None
diff --git a/swh/storage/utils.py b/swh/storage/utils.py
--- a/swh/storage/utils.py
+++ b/swh/storage/utils.py
@@ -1,10 +1,14 @@
-# Copyright (C) 2019 The Software Heritage developers
+# Copyright (C) 2019-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
+import re
+
from typing import Optional, Tuple
+from swh.model.hashutil import hash_to_bytes
+
def _is_power_of_two(n: int) -> bool:
return n > 0 and n & (n-1) == 0
@@ -40,3 +44,25 @@
end = None if i == n-1 \
else (partition_size*(i+1)).to_bytes(nb_bytes, 'big')
return (start, end)
+
+
+def extract_collision_hash(error_message: str) -> Optional[Tuple[str, bytes]]:
+ """Utilities to extract the hash information from a hash collision error.
+
+ Hash collision error message are of the form:
+ 'Key (<hash-type>)=(<double-escaped-hash) already exists.'
+
+ for example:
+ 'Key (sha1)=(\\x34973274ccef6ab4dfaaf86599792fa9c3fe4689) already exists.'
+
+ Return:
+ A formatted string
+
+ """
+ pattern = r'\w* \((?P<type>[^)]+)\)=\(\\x(?P<id>[a-f0-9]+)\) \w*'
+ result = re.match(pattern, error_message)
+ if result:
+ hash_type = result.group('type')
+ hash_id = result.group('id')
+ return hash_type, hash_to_bytes(hash_id)
+ return None
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Nov 5 2024, 3:05 PM (11 w, 15 h ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3219334
Attached To
D2783: storage(s): Identify and provide the collision hash(es) in HashCollision exception
Event Timeline
Log In to Comment