diff --git a/swh/storage/cassandra/storage.py b/swh/storage/cassandra/storage.py --- a/swh/storage/cassandra/storage.py +++ b/swh/storage/cassandra/storage.py @@ -17,9 +17,10 @@ Revision, Release, Directory, DirectoryEntry, Content, SkippedContent, OriginVisit, Snapshot, Origin ) -from swh.model.hashutil import DEFAULT_ALGORITHMS +from swh.model.hashutil import DEFAULT_ALGORITHMS, hash_to_hex from swh.storage.objstorage import ObjStorage from swh.storage.writer import JournalWriter +from swh.storage.utils import content_hex_hashes from .. import HashCollision from ..exc import StorageArgumentException @@ -120,7 +121,8 @@ if collisions: collisions.append(content.hashes()) raise HashCollision( - algo, content.get_hash(algo), collisions) + algo, hash_to_hex(content.get_hash(algo)), + [content_hex_hashes(c) for c in collisions]) (token, insertion_finalizer) = \ self._cql_runner.content_add_prepare(content) diff --git a/swh/storage/in_memory.py b/swh/storage/in_memory.py --- a/swh/storage/in_memory.py +++ b/swh/storage/in_memory.py @@ -29,7 +29,7 @@ from .exc import StorageArgumentException from .converters import origin_url_to_sha1 -from .utils import get_partition_bounds_bytes +from .utils import get_partition_bounds_bytes, content_hex_hashes from .writer import JournalWriter # Max block size of contents to return @@ -102,7 +102,9 @@ # Add the new colliding content colliding_content_hashes.append(content.hashes()) raise HashCollision( - algorithm, hash_, colliding_content_hashes) + algorithm, hash_to_hex(hash_), + [content_hex_hashes(c) + for c in colliding_content_hashes]) for algorithm in DEFAULT_ALGORITHMS: hash_ = content.get_hash(algorithm) self._content_indexes[algorithm][hash_].add(key) diff --git a/swh/storage/storage.py b/swh/storage/storage.py --- a/swh/storage/storage.py +++ b/swh/storage/storage.py @@ -32,7 +32,7 @@ from .algos import diff from .metrics import timed, send_metric, process_metrics from .utils import ( - get_partition_bounds_bytes, extract_collision_hash + get_partition_bounds_bytes, extract_collision_hash, content_hex_hashes ) from .writer import JournalWriter @@ -177,7 +177,10 @@ collision_contents_hashes = None raise HashCollision( - hash_name, hash_id, collision_contents_hashes + hash_name, hash_id, [ + content_hex_hashes(c) + for c in collision_contents_hashes + ] ) from None else: raise diff --git a/swh/storage/tests/test_storage.py b/swh/storage/tests/test_storage.py --- a/swh/storage/tests/test_storage.py +++ b/swh/storage/tests/test_storage.py @@ -30,10 +30,12 @@ Content, OriginVisit, Release, Revision ) from swh.model.hypothesis_strategies import objects +from swh.model.hashutil import hash_to_hex from swh.storage import HashCollision, get_storage from swh.storage.converters import origin_url_to_sha1 as sha1 from swh.storage.exc import StorageArgumentException from swh.storage.interface import StorageInterface +from swh.storage.utils import content_hex_hashes from .storage_data import data @@ -317,11 +319,11 @@ actual_algo = cm.value.args[0] assert actual_algo in ['sha1', 'sha1_git', 'blake2s256'] actual_id = cm.value.args[1] - assert actual_id == cont1[actual_algo] + assert actual_id == hash_to_hex(cont1[actual_algo]) assert len(cm.value.args[2]) == 2 assert cm.value.args[2] == [ - Content.from_dict(cont1).hashes(), - Content.from_dict(cont1b).hashes() + content_hex_hashes(Content.from_dict(cont1).hashes()), + content_hex_hashes(Content.from_dict(cont1b).hashes()) ] def test_content_update(self, swh_storage): @@ -390,11 +392,11 @@ actual_algo = cm.value.args[0] assert actual_algo in ['sha1', 'sha1_git', 'blake2s256'] actual_id = cm.value.args[1] - assert actual_id == cont1[actual_algo] + assert actual_id == hash_to_hex(cont1[actual_algo]) assert len(cm.value.args[2]) == 2 assert cm.value.args[2] == [ - Content.from_dict(cont1).hashes(), - Content.from_dict(cont1b).hashes() + content_hex_hashes(Content.from_dict(cont1).hashes()), + content_hex_hashes(Content.from_dict(cont1b).hashes()) ] def test_skipped_content_add(self, swh_storage): diff --git a/swh/storage/tests/test_utils.py b/swh/storage/tests/test_utils.py --- a/swh/storage/tests/test_utils.py +++ b/swh/storage/tests/test_utils.py @@ -3,33 +3,62 @@ # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -from swh.model.hashutil import hash_to_bytes -from swh.storage.utils import extract_collision_hash +from swh.model import hashutil +from swh.storage.utils import extract_collision_hash, content_hex_hashes def test_extract_collision_hash(): for msg, expected_result in [ ( 'Key (sha1)=(\\x34973274ccef6ab4dfaaf86599792fa9c3fe4689) ...', - ('sha1', hash_to_bytes( - '34973274ccef6ab4dfaaf86599792fa9c3fe4689')), + ('sha1', '34973274ccef6ab4dfaaf86599792fa9c3fe4689'), ), ( 'Key (sha1_git)=(\\x34973274ccef6ab4dfaaf86599792fa9c3fe4699) already exists', # noqa - ('sha1_git', hash_to_bytes( - '34973274ccef6ab4dfaaf86599792fa9c3fe4699')), + ('sha1_git', + '34973274ccef6ab4dfaaf86599792fa9c3fe4699'), ), ( 'Key (sha256)=(\\x673650f936cb3b0a2f93ce09d81be10748b1b203c19e8176b4eefc1964a0cf3a) ...', # noqa - ('sha256', hash_to_bytes( - '673650f936cb3b0a2f93ce09d81be10748b1b203c19e8176b4eefc1964a0cf3a')) # noqa + ('sha256', + '673650f936cb3b0a2f93ce09d81be10748b1b203c19e8176b4eefc1964a0cf3a') # noqa ), ( 'Key (blake2s)=(\\xd5fe1939576527e42cfd76a9455a2432fe7f56669564577dd93c4280e76d661d) ...', # noqa - ('blake2s', hash_to_bytes( - 'd5fe1939576527e42cfd76a9455a2432fe7f56669564577dd93c4280e76d661d')) # noqa + ('blake2s', + 'd5fe1939576527e42cfd76a9455a2432fe7f56669564577dd93c4280e76d661d') # noqa ), ]: assert extract_collision_hash(msg) == expected_result assert extract_collision_hash('Nothing matching') is None + + +def test_content_hex_hashes(): + input_content = { + "blake2s256": hashutil.hash_to_bytes( + "8f677e3214ca8b2acad91884a1571ef3f12b786501f9a6bedfd6239d82095dd2" + ), + "sha1_git": hashutil.hash_to_bytes( + "ba9aaa145ccd24ef760cf31c74d8f7ca1a2e47b0"), + "sha256": hashutil.hash_to_bytes( + "2bb787a73e37352f92383abe7e2902936d1059ad9f1ba6daaa9c1e58ee6970d0" + ), + "sha1": hashutil.hash_to_bytes( + "38762cf7f55934b34d179ae6a4c80cadccbb7f0a"), + } + + expected_content = { + "blake2s256": + "8f677e3214ca8b2acad91884a1571ef3f12b786501f9a6bedfd6239d82095dd2", + "sha1_git": "ba9aaa145ccd24ef760cf31c74d8f7ca1a2e47b0", + "sha256": + "2bb787a73e37352f92383abe7e2902936d1059ad9f1ba6daaa9c1e58ee6970d0", + "sha1": "38762cf7f55934b34d179ae6a4c80cadccbb7f0a" + } + + actual_content = content_hex_hashes(input_content) + + assert len(actual_content) == len(expected_content) + for algo in hashutil.DEFAULT_ALGORITHMS: + assert actual_content[algo] == expected_content[algo] diff --git a/swh/storage/utils.py b/swh/storage/utils.py --- a/swh/storage/utils.py +++ b/swh/storage/utils.py @@ -5,9 +5,9 @@ import re -from typing import Optional, Tuple +from typing import Dict, Optional, Tuple -from swh.model.hashutil import hash_to_bytes +from swh.model.hashutil import hash_to_hex, DEFAULT_ALGORITHMS def _is_power_of_two(n: int) -> bool: @@ -46,7 +46,7 @@ return (start, end) -def extract_collision_hash(error_message: str) -> Optional[Tuple[str, bytes]]: +def extract_collision_hash(error_message: str) -> Optional[Tuple[str, str]]: """Utilities to extract the hash information from a hash collision error. Hash collision error message are of the form: @@ -64,5 +64,14 @@ if result: hash_type = result.group('type') hash_id = result.group('id') - return hash_type, hash_to_bytes(hash_id) + return hash_type, hash_id return None + + +def content_hex_hashes(content: Dict[str, bytes]) -> Dict[str, str]: + """Convert bytes hashes into hex hashes. + + """ + return { + algo: hash_to_hex(content[algo]) for algo in DEFAULT_ALGORITHMS + }