diff --git a/swh/storage/__init__.py b/swh/storage/__init__.py --- a/swh/storage/__init__.py +++ b/swh/storage/__init__.py @@ -6,10 +6,6 @@ import warnings -class HashCollision(Exception): - pass - - STORAGE_IMPLEMENTATION = { 'pipeline', 'local', 'remote', 'memory', 'filter', 'buffer', 'retry', 'validate', 'cassandra', diff --git a/swh/storage/api/client.py b/swh/storage/api/client.py --- a/swh/storage/api/client.py +++ b/swh/storage/api/client.py @@ -8,8 +8,7 @@ from swh.core.api import RPCClient, RemoteException from swh.model.model import Content -from .. import HashCollision -from ..exc import StorageAPIError, StorageArgumentException +from ..exc import StorageAPIError, StorageArgumentException, HashCollision from ..interface import StorageInterface from .serializers import ENCODERS, DECODERS @@ -33,8 +32,8 @@ and e.args and e.args[0].get('type') == 'HashCollision': # XXX: workaround until we fix these HashCollisions happening # when they shouldn't - raise HashCollision( - *e.args[0]['args']) + algo, hash_id, colliding_contents = e.args[0]['args'] + raise HashCollision(algo, hash_id, colliding_contents) else: raise diff --git a/swh/storage/cassandra/storage.py b/swh/storage/cassandra/storage.py --- a/swh/storage/cassandra/storage.py +++ b/swh/storage/cassandra/storage.py @@ -21,8 +21,7 @@ from swh.storage.objstorage import ObjStorage from swh.storage.writer import JournalWriter -from .. import HashCollision -from ..exc import StorageArgumentException +from ..exc import StorageArgumentException, HashCollision from .common import TOKEN_BEGIN, TOKEN_END from .converters import ( revision_to_db, revision_from_db, release_to_db, release_from_db, diff --git a/swh/storage/exc.py b/swh/storage/exc.py --- a/swh/storage/exc.py +++ b/swh/storage/exc.py @@ -1,8 +1,14 @@ -# Copyright (C) 2015-2016 The Software Heritage developers +# Copyright (C) 2015-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +from typing import Dict, List + +from swh.model.hashutil import hash_to_hex + +from swh.storage.utils import content_hex_hashes, content_bytes_hashes + class StorageDBError(Exception): """Specific storage db error (connection, erroneous queries, etc...) @@ -26,3 +32,20 @@ class StorageArgumentException(Exception): """Argument passed to a Storage endpoint is invalid.""" pass + + +class HashCollision(Exception): + """Exception raised when a content collides in a storage backend + + """ + def __init__(self, algo, hash_id, colliding_contents): + super().__init__() + self.algo = algo + self.hash_id = hash_to_hex(hash_id) + self.colliding_contents = [content_hex_hashes(c) + for c in colliding_contents] + # retro-compatibility + self.args = [self.algo, self.hash_id, self.colliding_contents] + + def colliding_content_hashes(self) -> List[Dict[str, bytes]]: + return [content_bytes_hashes(c) for c in self.colliding_contents] diff --git a/swh/storage/in_memory.py b/swh/storage/in_memory.py --- a/swh/storage/in_memory.py +++ b/swh/storage/in_memory.py @@ -25,8 +25,7 @@ from swh.model.hashutil import DEFAULT_ALGORITHMS, hash_to_bytes, hash_to_hex from swh.storage.objstorage import ObjStorage -from . import HashCollision -from .exc import StorageArgumentException +from .exc import StorageArgumentException, HashCollision from .converters import origin_url_to_sha1 from .utils import get_partition_bounds_bytes diff --git a/swh/storage/storage.py b/swh/storage/storage.py --- a/swh/storage/storage.py +++ b/swh/storage/storage.py @@ -25,10 +25,10 @@ from swh.model.hashutil import DEFAULT_ALGORITHMS, hash_to_bytes, hash_to_hex from swh.storage.objstorage import ObjStorage -from . import converters, HashCollision +from . import converters from .common import db_transaction_generator, db_transaction from .db import Db -from .exc import StorageArgumentException, StorageDBError +from .exc import StorageArgumentException, StorageDBError, HashCollision from .algos import diff from .metrics import timed, send_metric, process_metrics from .utils import ( diff --git a/swh/storage/tests/test_exception.py b/swh/storage/tests/test_exception.py new file mode 100644 --- /dev/null +++ b/swh/storage/tests/test_exception.py @@ -0,0 +1,34 @@ +# Copyright (C) 2020 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +from swh.model import hashutil + +from swh.storage.exc import HashCollision +from swh.storage.utils import content_hex_hashes + + +def test_hash_collision_exception(): + hex_hash_id = "38762cf7f55934b34d179ae6a4c80cadccbb7f0a" + hash_id = hashutil.hash_to_bytes(hex_hash_id) + + content = { + "blake2s256": hashutil.hash_to_bytes( + "8f677e3214ca8b2acad91884a1571ef3f12b786501f9a6bedfd6239d82095dd2" + ), + "sha1_git": hashutil.hash_to_bytes( + "ba9aaa145ccd24ef760cf31c74d8f7ca1a2e47b0"), + "sha256": hashutil.hash_to_bytes( + "2bb787a73e37352f92383abe7e2902936d1059ad9f1ba6daaa9c1e58ee6970d0" + ), + "sha1": hash_id, + } + + exc = HashCollision('sha1', hash_id, [content]) + + assert exc.algo == 'sha1' + assert exc.hash_id == hex_hash_id + assert exc.colliding_contents == [content_hex_hashes(content)] + + assert exc.colliding_content_hashes() == [content] diff --git a/swh/storage/tests/test_retry.py b/swh/storage/tests/test_retry.py --- a/swh/storage/tests/test_retry.py +++ b/swh/storage/tests/test_retry.py @@ -13,8 +13,14 @@ Content, Directory, Release, Revision, Snapshot, Origin ) -from swh.storage import HashCollision, get_storage -from swh.storage.exc import StorageArgumentException +from swh.storage import get_storage +from swh.storage.exc import HashCollision, StorageArgumentException + + +@pytest.fixture +def fake_hash_collision(sample_data): + return HashCollision( + 'sha1', "38762cf7f55934b34d179ae6a4c80cadccbb7f0a", []) @pytest.fixture @@ -50,7 +56,7 @@ def test_retrying_proxy_storage_content_add_with_retry( - swh_storage, sample_data, mocker): + swh_storage, sample_data, mocker, fake_hash_collision): """Multiple retries for hash collision and psycopg2 error but finally ok """ @@ -58,7 +64,7 @@ 'swh.storage.in_memory.InMemoryStorage.content_add') mock_memory.side_effect = [ # first try goes ko - HashCollision('content hash collision'), + fake_hash_collision, # second try goes ko psycopg2.IntegrityError('content already inserted'), # ok then! @@ -126,7 +132,7 @@ def test_retrying_proxy_storage_content_add_metadata_with_retry( - swh_storage, sample_data, mocker): + swh_storage, sample_data, mocker, fake_hash_collision): """Multiple retries for hash collision and psycopg2 error but finally ok """ @@ -134,7 +140,7 @@ 'swh.storage.in_memory.InMemoryStorage.content_add_metadata') mock_memory.side_effect = [ # first try goes ko - HashCollision('content_metadata hash collision'), + fake_hash_collision, # second try goes ko psycopg2.IntegrityError('content_metadata already inserted'), # ok then! @@ -196,7 +202,7 @@ def test_retrying_proxy_swh_storage_origin_add_one_retry( - swh_storage, sample_data, mocker): + swh_storage, sample_data, mocker, fake_hash_collision): """Multiple retries for hash collision and psycopg2 error but finally ok """ @@ -205,7 +211,7 @@ 'swh.storage.in_memory.InMemoryStorage.origin_add_one') mock_memory.side_effect = [ # first try goes ko - HashCollision('origin hash collision'), + fake_hash_collision, # second try goes ko psycopg2.IntegrityError('origin already inserted'), # ok then! @@ -270,7 +276,7 @@ def test_retrying_proxy_swh_storage_origin_visit_add_retry( - swh_storage, sample_data, mocker): + swh_storage, sample_data, mocker, fake_hash_collision): """Multiple retries for hash collision and psycopg2 error but finally ok """ @@ -281,7 +287,7 @@ 'swh.storage.in_memory.InMemoryStorage.origin_visit_add') mock_memory.side_effect = [ # first try goes ko - HashCollision('origin hash collision'), + fake_hash_collision, # second try goes ko psycopg2.IntegrityError('origin already inserted'), # ok then! @@ -349,7 +355,7 @@ def test_retrying_proxy_storage_tool_add_with_retry( - swh_storage, sample_data, mocker): + swh_storage, sample_data, mocker, fake_hash_collision): """Multiple retries for hash collision and psycopg2 error but finally ok """ @@ -358,7 +364,7 @@ 'swh.storage.in_memory.InMemoryStorage.tool_add') mock_memory.side_effect = [ # first try goes ko - HashCollision('tool hash collision'), + fake_hash_collision, # second try goes ko psycopg2.IntegrityError('tool already inserted'), # ok then! @@ -434,7 +440,7 @@ def test_retrying_proxy_storage_metadata_provider_add_with_retry( - swh_storage, sample_data, mocker): + swh_storage, sample_data, mocker, fake_hash_collision): """Multiple retries for hash collision and psycopg2 error but finally ok """ @@ -445,7 +451,7 @@ 'swh.storage.in_memory.InMemoryStorage.metadata_provider_add') mock_memory.side_effect = [ # first try goes ko - HashCollision('provider_id hash collision'), + fake_hash_collision, # second try goes ko psycopg2.IntegrityError('provider_id already inserted'), # ok then! @@ -518,7 +524,7 @@ def test_retrying_proxy_storage_origin_metadata_add_with_retry( - swh_storage, sample_data, mocker): + swh_storage, sample_data, mocker, fake_hash_collision): """Multiple retries for hash collision and psycopg2 error but finally ok """ @@ -532,7 +538,7 @@ mock_memory.side_effect = [ # first try goes ko - HashCollision('provider_id hash collision'), + fake_hash_collision, # second try goes ko psycopg2.IntegrityError('provider_id already inserted'), # ok then! @@ -614,7 +620,7 @@ def test_retrying_proxy_swh_storage_origin_visit_update_retry( - swh_storage, sample_data, mocker): + swh_storage, sample_data, mocker, fake_hash_collision): """Multiple retries for hash collision and psycopg2 error but finally ok """ @@ -625,7 +631,7 @@ 'swh.storage.in_memory.InMemoryStorage.origin_visit_update') mock_memory.side_effect = [ # first try goes ko - HashCollision('origin hash collision'), + fake_hash_collision, # second try goes ko psycopg2.IntegrityError('origin already inserted'), # ok then! @@ -687,7 +693,7 @@ def test_retrying_proxy_storage_directory_add_with_retry( - swh_storage, sample_data, mocker): + swh_storage, sample_data, mocker, fake_hash_collision): """Multiple retries for hash collision and psycopg2 error but finally ok """ @@ -695,7 +701,7 @@ 'swh.storage.in_memory.InMemoryStorage.directory_add') mock_memory.side_effect = [ # first try goes ko - HashCollision('directory hash collision'), + fake_hash_collision, # second try goes ko psycopg2.IntegrityError('directory already inserted'), # ok then! @@ -763,7 +769,7 @@ def test_retrying_proxy_storage_revision_add_with_retry( - swh_storage, sample_data, mocker): + swh_storage, sample_data, mocker, fake_hash_collision): """Multiple retries for hash collision and psycopg2 error but finally ok """ @@ -771,7 +777,7 @@ 'swh.storage.in_memory.InMemoryStorage.revision_add') mock_memory.side_effect = [ # first try goes ko - HashCollision('revision hash collision'), + fake_hash_collision, # second try goes ko psycopg2.IntegrityError('revision already inserted'), # ok then! @@ -840,7 +846,7 @@ def test_retrying_proxy_storage_release_add_with_retry( - swh_storage, sample_data, mocker): + swh_storage, sample_data, mocker, fake_hash_collision): """Multiple retries for hash collision and psycopg2 error but finally ok """ @@ -848,7 +854,7 @@ 'swh.storage.in_memory.InMemoryStorage.release_add') mock_memory.side_effect = [ # first try goes ko - HashCollision('release hash collision'), + fake_hash_collision, # second try goes ko psycopg2.IntegrityError('release already inserted'), # ok then! @@ -917,7 +923,7 @@ def test_retrying_proxy_storage_snapshot_add_with_retry( - swh_storage, sample_data, mocker): + swh_storage, sample_data, mocker, fake_hash_collision): """Multiple retries for hash collision and psycopg2 error but finally ok """ @@ -925,7 +931,7 @@ 'swh.storage.in_memory.InMemoryStorage.snapshot_add') mock_memory.side_effect = [ # first try goes ko - HashCollision('snapshot hash collision'), + fake_hash_collision, # second try goes ko psycopg2.IntegrityError('snapshot already inserted'), # ok then! diff --git a/swh/storage/tests/test_storage.py b/swh/storage/tests/test_storage.py --- a/swh/storage/tests/test_storage.py +++ b/swh/storage/tests/test_storage.py @@ -30,10 +30,12 @@ Content, OriginVisit, Release, Revision ) from swh.model.hypothesis_strategies import objects -from swh.storage import HashCollision, get_storage +from swh.model.hashutil import hash_to_hex +from swh.storage import get_storage from swh.storage.converters import origin_url_to_sha1 as sha1 -from swh.storage.exc import StorageArgumentException +from swh.storage.exc import HashCollision, StorageArgumentException from swh.storage.interface import StorageInterface +from swh.storage.utils import content_hex_hashes from .storage_data import data @@ -314,12 +316,18 @@ with pytest.raises(HashCollision) as cm: swh_storage.content_add([cont1, cont1b]) - actual_algo = cm.value.args[0] + exc = cm.value + actual_algo = exc.algo assert actual_algo in ['sha1', 'sha1_git', 'blake2s256'] - actual_id = cm.value.args[1] - assert actual_id == cont1[actual_algo] - assert len(cm.value.args[2]) == 2 - assert cm.value.args[2] == [ + actual_id = exc.hash_id + assert actual_id == hash_to_hex(cont1[actual_algo]) + collisions = exc.args[2] + assert len(collisions) == 2 + assert collisions == [ + content_hex_hashes(Content.from_dict(cont1).hashes()), + content_hex_hashes(Content.from_dict(cont1b).hashes()) + ] + assert exc.colliding_content_hashes() == [ Content.from_dict(cont1).hashes(), Content.from_dict(cont1b).hashes() ] @@ -387,12 +395,18 @@ with pytest.raises(HashCollision) as cm: swh_storage.content_add_metadata([cont1, cont1b]) - actual_algo = cm.value.args[0] + exc = cm.value + actual_algo = exc.algo assert actual_algo in ['sha1', 'sha1_git', 'blake2s256'] - actual_id = cm.value.args[1] - assert actual_id == cont1[actual_algo] - assert len(cm.value.args[2]) == 2 - assert cm.value.args[2] == [ + actual_id = exc.hash_id + assert actual_id == hash_to_hex(cont1[actual_algo]) + collisions = exc.args[2] + assert len(collisions) == 2 + assert collisions == [ + content_hex_hashes(Content.from_dict(cont1).hashes()), + content_hex_hashes(Content.from_dict(cont1b).hashes()) + ] + assert exc.colliding_content_hashes() == [ Content.from_dict(cont1).hashes(), Content.from_dict(cont1b).hashes() ] diff --git a/swh/storage/tests/test_utils.py b/swh/storage/tests/test_utils.py --- a/swh/storage/tests/test_utils.py +++ b/swh/storage/tests/test_utils.py @@ -3,33 +3,95 @@ # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -from swh.model.hashutil import hash_to_bytes -from swh.storage.utils import extract_collision_hash +from swh.model import hashutil +from swh.storage.utils import ( + extract_collision_hash, content_hex_hashes, content_bytes_hashes +) def test_extract_collision_hash(): for msg, expected_result in [ ( 'Key (sha1)=(\\x34973274ccef6ab4dfaaf86599792fa9c3fe4689) ...', - ('sha1', hash_to_bytes( + ('sha1', hashutil.hash_to_bytes( '34973274ccef6ab4dfaaf86599792fa9c3fe4689')), ), ( 'Key (sha1_git)=(\\x34973274ccef6ab4dfaaf86599792fa9c3fe4699) already exists', # noqa - ('sha1_git', hash_to_bytes( + ('sha1_git', hashutil.hash_to_bytes( '34973274ccef6ab4dfaaf86599792fa9c3fe4699')), ), ( 'Key (sha256)=(\\x673650f936cb3b0a2f93ce09d81be10748b1b203c19e8176b4eefc1964a0cf3a) ...', # noqa - ('sha256', hash_to_bytes( + ('sha256', hashutil.hash_to_bytes( '673650f936cb3b0a2f93ce09d81be10748b1b203c19e8176b4eefc1964a0cf3a')) # noqa ), ( 'Key (blake2s)=(\\xd5fe1939576527e42cfd76a9455a2432fe7f56669564577dd93c4280e76d661d) ...', # noqa - ('blake2s', hash_to_bytes( + ('blake2s', hashutil.hash_to_bytes( 'd5fe1939576527e42cfd76a9455a2432fe7f56669564577dd93c4280e76d661d')) # noqa ), ]: assert extract_collision_hash(msg) == expected_result assert extract_collision_hash('Nothing matching') is None + + +def test_content_hex_hashes(): + input_content = { + "blake2s256": hashutil.hash_to_bytes( + "8f677e3214ca8b2acad91884a1571ef3f12b786501f9a6bedfd6239d82095dd2" + ), + "sha1_git": hashutil.hash_to_bytes( + "ba9aaa145ccd24ef760cf31c74d8f7ca1a2e47b0"), + "sha256": hashutil.hash_to_bytes( + "2bb787a73e37352f92383abe7e2902936d1059ad9f1ba6daaa9c1e58ee6970d0" + ), + "sha1": hashutil.hash_to_bytes( + "38762cf7f55934b34d179ae6a4c80cadccbb7f0a"), + } + + expected_content = { + "blake2s256": + "8f677e3214ca8b2acad91884a1571ef3f12b786501f9a6bedfd6239d82095dd2", + "sha1_git": "ba9aaa145ccd24ef760cf31c74d8f7ca1a2e47b0", + "sha256": + "2bb787a73e37352f92383abe7e2902936d1059ad9f1ba6daaa9c1e58ee6970d0", + "sha1": "38762cf7f55934b34d179ae6a4c80cadccbb7f0a" + } + + actual_content = content_hex_hashes(input_content) + + assert len(actual_content) == len(expected_content) + for algo in hashutil.DEFAULT_ALGORITHMS: + assert actual_content[algo] == expected_content[algo] + + +def test_content_bytes_hashes(): + input_content = { + "blake2s256": + "8f677e3214ca8b2acad91884a1571ef3f12b786501f9a6bedfd6239d82095dd2", + "sha1_git": "ba9aaa145ccd24ef760cf31c74d8f7ca1a2e47b0", + "sha256": + "2bb787a73e37352f92383abe7e2902936d1059ad9f1ba6daaa9c1e58ee6970d0", + "sha1": "38762cf7f55934b34d179ae6a4c80cadccbb7f0a" + } + + expected_content = { + "blake2s256": hashutil.hash_to_bytes( + "8f677e3214ca8b2acad91884a1571ef3f12b786501f9a6bedfd6239d82095dd2" + ), + "sha1_git": hashutil.hash_to_bytes( + "ba9aaa145ccd24ef760cf31c74d8f7ca1a2e47b0"), + "sha256": hashutil.hash_to_bytes( + "2bb787a73e37352f92383abe7e2902936d1059ad9f1ba6daaa9c1e58ee6970d0" + ), + "sha1": hashutil.hash_to_bytes( + "38762cf7f55934b34d179ae6a4c80cadccbb7f0a"), + } + + actual_content = content_bytes_hashes(input_content) + + assert len(actual_content) == len(expected_content) + for algo in hashutil.DEFAULT_ALGORITHMS: + assert actual_content[algo] == expected_content[algo] diff --git a/swh/storage/utils.py b/swh/storage/utils.py --- a/swh/storage/utils.py +++ b/swh/storage/utils.py @@ -5,9 +5,11 @@ import re -from typing import Optional, Tuple +from typing import Dict, Optional, Tuple -from swh.model.hashutil import hash_to_bytes +from swh.model.hashutil import ( + hash_to_bytes, hash_to_hex, DEFAULT_ALGORITHMS +) def _is_power_of_two(n: int) -> bool: @@ -66,3 +68,21 @@ hash_id = result.group('id') return hash_type, hash_to_bytes(hash_id) return None + + +def content_hex_hashes(content: Dict[str, bytes]) -> Dict[str, str]: + """Convert bytes hashes into hex hashes. + + """ + return { + algo: hash_to_hex(content[algo]) for algo in DEFAULT_ALGORITHMS + } + + +def content_bytes_hashes(content: Dict[str, str]) -> Dict[str, bytes]: + """Convert bytes hashes into hex hashes. + + """ + return { + algo: hash_to_bytes(content[algo]) for algo in DEFAULT_ALGORITHMS + }