diff --git a/swh/storage/storage.py b/swh/storage/storage.py --- a/swh/storage/storage.py +++ b/swh/storage/storage.py @@ -33,7 +33,10 @@ from .exc import StorageArgumentException, StorageDBError from .algos import diff from .metrics import timed, send_metric, process_metrics -from .utils import get_partition_bounds_bytes +from .utils import ( + get_partition_bounds_bytes, extract_collision_hash, + content_to_primary_key +) from .writer import JournalWriter @@ -158,14 +161,27 @@ except psycopg2.IntegrityError as e: if e.diag.sqlstate == '23505' and \ e.diag.table_name == 'content': - constraint_to_hash_name = { - 'content_pkey': 'sha1', - 'content_sha1_git_idx': 'sha1_git', - 'content_sha256_idx': 'sha256', + message_detail = e.diag.message_detail + if message_detail: + hash_name, hash_id = extract_collision_hash(message_detail) + collision_contents_hashes_as_tuple = [ + content_to_primary_key(c) for c in content + if getattr(c, hash_name) == hash_id + ] + else: + constraint_to_hash_name = { + 'content_pkey': 'sha1', + 'content_sha1_git_idx': 'sha1_git', + 'content_sha256_idx': 'sha256', } - colliding_hash_name = constraint_to_hash_name \ - .get(e.diag.constraint_name) - raise HashCollision(colliding_hash_name) from None + hash_name = constraint_to_hash_name \ + .get(e.diag.constraint_name) + hash_id = None + collision_contents_hashes_as_tuple = None + + raise HashCollision( + hash_name, hash_id, collision_contents_hashes_as_tuple + ) from None else: raise diff --git a/swh/storage/tests/test_utils.py b/swh/storage/tests/test_utils.py new file mode 100644 --- /dev/null +++ b/swh/storage/tests/test_utils.py @@ -0,0 +1,54 @@ +# Copyright (C) 2020 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +from swh.model.hashutil import hash_to_bytes, DEFAULT_ALGORITHMS +from swh.model.model import 
from swh.model.hashutil import hash_to_bytes, DEFAULT_ALGORITHMS
from swh.model.model import Content
from swh.storage.utils import extract_collision_hash, content_to_primary_key

from .storage_data import data


def test_extract_collision_hash():
    """Collision messages yield (hash name, hash bytes); anything else None.

    """
    # (error message, hash name, hex digest expected to be extracted)
    cases = [
        (
            'Key (sha1)=(\\x34973274ccef6ab4dfaaf86599792fa9c3fe4689) ...',
            'sha1',
            '34973274ccef6ab4dfaaf86599792fa9c3fe4689',
        ),
        (
            'Key (sha1_git)=(\\x34973274ccef6ab4dfaaf86599792fa9c3fe4699) already exists',  # noqa
            'sha1_git',
            '34973274ccef6ab4dfaaf86599792fa9c3fe4699',
        ),
        (
            'Key (sha256)=(\\x673650f936cb3b0a2f93ce09d81be10748b1b203c19e8176b4eefc1964a0cf3a) ...',  # noqa
            'sha256',
            '673650f936cb3b0a2f93ce09d81be10748b1b203c19e8176b4eefc1964a0cf3a',  # noqa
        ),
        (
            'Key (blake2s)=(\\xd5fe1939576527e42cfd76a9455a2432fe7f56669564577dd93c4280e76d661d) ...',  # noqa
            'blake2s',
            'd5fe1939576527e42cfd76a9455a2432fe7f56669564577dd93c4280e76d661d',  # noqa
        ),
    ]

    for message, hash_name, hex_digest in cases:
        expected = (hash_name, hash_to_bytes(hex_digest))
        assert extract_collision_hash(message) == expected

    assert extract_collision_hash('Nothing matching') is None


def test_content_to_primary_key():
    """The primary key holds one (algo, hash) pair per default algorithm.

    """
    for raw_content in (data.cont, data.cont2, data.cont3):
        content = Content.from_dict(raw_content)
        reference_hashes = content.hashes()

        primary_key = content_to_primary_key(content)

        assert len(primary_key) == len(DEFAULT_ALGORITHMS)
        for algo_name, digest in primary_key:
            assert algo_name in DEFAULT_ALGORITHMS
            assert digest == reference_hashes[algo_name]
# Copyright (C) 2019-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import re

from typing import List, Optional, Tuple

from swh.model.hashutil import hash_to_bytes, DEFAULT_ALGORITHMS
from swh.model.model import Content


# Shape of a unique-violation detail message, e.g.
#   Key (sha1)=(\x34973274ccef6ab4dfaaf86599792fa9c3fe4689) already exists.
# The id group only accepts whole hex bytes (pairs of digits), optionally
# preceded by the literal '\x' prefix, so that whatever it captures can always
# be converted to bytes. Compiled once at import time rather than on each call.
_COLLISION_PATTERN = re.compile(
    r'\w* \((?P<type>[a-z0-9_]+)\)=\((?:\\x)?(?P<id>(?:[a-f0-9]{2})+)\) \w*'
)


def extract_collision_hash(error_message: str) -> Optional[Tuple[str, bytes]]:
    """Extract the hash information from a hash collision error message.

    Hash collision error messages are expected to be of the form::

        Key (HASH_TYPE)=(\\xHEX_HASH) already exists.

    where HASH_TYPE is the name of the colliding hash column (e.g. sha1,
    sha1_git, sha256) and HEX_HASH its lowercase hexadecimal value.

    Args:
        error_message: detail message of the integrity error

    Returns:
        A pair (hash type, hash value as bytes) when the message has the
        expected form, None otherwise (including when the hash part is not
        valid hexadecimal).

    """
    match = _COLLISION_PATTERN.match(error_message)
    if match is None:
        return None
    return match.group('type'), hash_to_bytes(match.group('id'))


def content_to_primary_key(content: Content) -> List[Tuple[str, bytes]]:
    """Convert a content to its primary key representation.

    Args:
        content: Content object

    Returns:
        A list of (algorithm name, hash value) pairs, one per algorithm in
        DEFAULT_ALGORITHMS, in the iteration order of DEFAULT_ALGORITHMS.

    """
    return [(algo, getattr(content, algo)) for algo in DEFAULT_ALGORITHMS]