Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F7123196
D2872.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
20 KB
Subscribers
None
D2872.diff
View Options
diff --git a/swh/storage/__init__.py b/swh/storage/__init__.py
--- a/swh/storage/__init__.py
+++ b/swh/storage/__init__.py
@@ -6,10 +6,6 @@
import warnings
-class HashCollision(Exception):
- pass
-
-
STORAGE_IMPLEMENTATION = {
'pipeline', 'local', 'remote', 'memory', 'filter', 'buffer', 'retry',
'validate', 'cassandra',
diff --git a/swh/storage/api/client.py b/swh/storage/api/client.py
--- a/swh/storage/api/client.py
+++ b/swh/storage/api/client.py
@@ -8,8 +8,7 @@
from swh.core.api import RPCClient, RemoteException
from swh.model.model import Content
-from .. import HashCollision
-from ..exc import StorageAPIError, StorageArgumentException
+from ..exc import StorageAPIError, StorageArgumentException, HashCollision
from ..interface import StorageInterface
from .serializers import ENCODERS, DECODERS
diff --git a/swh/storage/cassandra/storage.py b/swh/storage/cassandra/storage.py
--- a/swh/storage/cassandra/storage.py
+++ b/swh/storage/cassandra/storage.py
@@ -21,8 +21,7 @@
from swh.storage.objstorage import ObjStorage
from swh.storage.writer import JournalWriter
-from .. import HashCollision
-from ..exc import StorageArgumentException
+from ..exc import StorageArgumentException, HashCollision
from .common import TOKEN_BEGIN, TOKEN_END
from .converters import (
revision_to_db, revision_from_db, release_to_db, release_from_db,
diff --git a/swh/storage/exc.py b/swh/storage/exc.py
--- a/swh/storage/exc.py
+++ b/swh/storage/exc.py
@@ -1,8 +1,14 @@
-# Copyright (C) 2015-2016 The Software Heritage developers
+# Copyright (C) 2015-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
+from typing import Dict, List
+
+from swh.model.hashutil import hash_to_hex
+
+from swh.storage.utils import content_hex_hashes, content_bytes_hashes
+
class StorageDBError(Exception):
"""Specific storage db error (connection, erroneous queries, etc...)
@@ -26,3 +32,18 @@
class StorageArgumentException(Exception):
"""Argument passed to a Storage endpoint is invalid."""
pass
+
+
+class HashCollision(Exception):
+ """Exception raised when a content collides in a storage backend
+
+ """
+ def __init__(self, algo, hash_id, colliding_contents):
+ self.algo = algo
+ self.hash_id = hash_to_hex(hash_id)
+ self.colliding_contents = [content_hex_hashes(c)
+ for c in colliding_contents]
+ super().__init__(self.algo, self.hash_id, self.colliding_contents)
+
+ def colliding_content_hashes(self) -> List[Dict[str, bytes]]:
+ return [content_bytes_hashes(c) for c in self.colliding_contents]
diff --git a/swh/storage/in_memory.py b/swh/storage/in_memory.py
--- a/swh/storage/in_memory.py
+++ b/swh/storage/in_memory.py
@@ -25,8 +25,7 @@
from swh.model.hashutil import DEFAULT_ALGORITHMS, hash_to_bytes, hash_to_hex
from swh.storage.objstorage import ObjStorage
-from . import HashCollision
-from .exc import StorageArgumentException
+from .exc import StorageArgumentException, HashCollision
from .converters import origin_url_to_sha1
from .utils import get_partition_bounds_bytes
diff --git a/swh/storage/storage.py b/swh/storage/storage.py
--- a/swh/storage/storage.py
+++ b/swh/storage/storage.py
@@ -25,10 +25,10 @@
from swh.model.hashutil import DEFAULT_ALGORITHMS, hash_to_bytes, hash_to_hex
from swh.storage.objstorage import ObjStorage
-from . import converters, HashCollision
+from . import converters
from .common import db_transaction_generator, db_transaction
from .db import Db
-from .exc import StorageArgumentException, StorageDBError
+from .exc import StorageArgumentException, StorageDBError, HashCollision
from .algos import diff
from .metrics import timed, send_metric, process_metrics
from .utils import (
diff --git a/swh/storage/tests/test_exception.py b/swh/storage/tests/test_exception.py
new file mode 100644
--- /dev/null
+++ b/swh/storage/tests/test_exception.py
@@ -0,0 +1,34 @@
+# Copyright (C) 2020 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from swh.model import hashutil
+
+from swh.storage.exc import HashCollision
+from swh.storage.utils import content_hex_hashes
+
+
+def test_hash_collision_exception():
+ hex_hash_id = "38762cf7f55934b34d179ae6a4c80cadccbb7f0a"
+ hash_id = hashutil.hash_to_bytes(hex_hash_id)
+
+ content = {
+ "blake2s256": hashutil.hash_to_bytes(
+ "8f677e3214ca8b2acad91884a1571ef3f12b786501f9a6bedfd6239d82095dd2"
+ ),
+ "sha1_git": hashutil.hash_to_bytes(
+ "ba9aaa145ccd24ef760cf31c74d8f7ca1a2e47b0"),
+ "sha256": hashutil.hash_to_bytes(
+ "2bb787a73e37352f92383abe7e2902936d1059ad9f1ba6daaa9c1e58ee6970d0"
+ ),
+ "sha1": hash_id,
+ }
+
+ exc = HashCollision('sha1', hash_id, [content])
+
+ assert exc.algo == 'sha1'
+ assert exc.hash_id == hex_hash_id
+ assert exc.colliding_contents == [content_hex_hashes(content)]
+
+ assert exc.colliding_content_hashes() == [content]
diff --git a/swh/storage/tests/test_retry.py b/swh/storage/tests/test_retry.py
--- a/swh/storage/tests/test_retry.py
+++ b/swh/storage/tests/test_retry.py
@@ -13,8 +13,14 @@
Content, Directory, Release, Revision, Snapshot, Origin
)
-from swh.storage import HashCollision, get_storage
-from swh.storage.exc import StorageArgumentException
+from swh.storage import get_storage
+from swh.storage.exc import HashCollision, StorageArgumentException
+
+
+@pytest.fixture
+def fake_hash_collision(sample_data):
+ return HashCollision(
+ 'sha1', "38762cf7f55934b34d179ae6a4c80cadccbb7f0a", [])
@pytest.fixture
@@ -50,7 +56,7 @@
def test_retrying_proxy_storage_content_add_with_retry(
- swh_storage, sample_data, mocker):
+ swh_storage, sample_data, mocker, fake_hash_collision):
"""Multiple retries for hash collision and psycopg2 error but finally ok
"""
@@ -58,7 +64,7 @@
'swh.storage.in_memory.InMemoryStorage.content_add')
mock_memory.side_effect = [
# first try goes ko
- HashCollision('content hash collision'),
+ fake_hash_collision,
# second try goes ko
psycopg2.IntegrityError('content already inserted'),
# ok then!
@@ -126,7 +132,7 @@
def test_retrying_proxy_storage_content_add_metadata_with_retry(
- swh_storage, sample_data, mocker):
+ swh_storage, sample_data, mocker, fake_hash_collision):
"""Multiple retries for hash collision and psycopg2 error but finally ok
"""
@@ -134,7 +140,7 @@
'swh.storage.in_memory.InMemoryStorage.content_add_metadata')
mock_memory.side_effect = [
# first try goes ko
- HashCollision('content_metadata hash collision'),
+ fake_hash_collision,
# second try goes ko
psycopg2.IntegrityError('content_metadata already inserted'),
# ok then!
@@ -196,7 +202,7 @@
def test_retrying_proxy_swh_storage_origin_add_one_retry(
- swh_storage, sample_data, mocker):
+ swh_storage, sample_data, mocker, fake_hash_collision):
"""Multiple retries for hash collision and psycopg2 error but finally ok
"""
@@ -205,7 +211,7 @@
'swh.storage.in_memory.InMemoryStorage.origin_add_one')
mock_memory.side_effect = [
# first try goes ko
- HashCollision('origin hash collision'),
+ fake_hash_collision,
# second try goes ko
psycopg2.IntegrityError('origin already inserted'),
# ok then!
@@ -270,7 +276,7 @@
def test_retrying_proxy_swh_storage_origin_visit_add_retry(
- swh_storage, sample_data, mocker):
+ swh_storage, sample_data, mocker, fake_hash_collision):
"""Multiple retries for hash collision and psycopg2 error but finally ok
"""
@@ -281,7 +287,7 @@
'swh.storage.in_memory.InMemoryStorage.origin_visit_add')
mock_memory.side_effect = [
# first try goes ko
- HashCollision('origin hash collision'),
+ fake_hash_collision,
# second try goes ko
psycopg2.IntegrityError('origin already inserted'),
# ok then!
@@ -349,7 +355,7 @@
def test_retrying_proxy_storage_tool_add_with_retry(
- swh_storage, sample_data, mocker):
+ swh_storage, sample_data, mocker, fake_hash_collision):
"""Multiple retries for hash collision and psycopg2 error but finally ok
"""
@@ -358,7 +364,7 @@
'swh.storage.in_memory.InMemoryStorage.tool_add')
mock_memory.side_effect = [
# first try goes ko
- HashCollision('tool hash collision'),
+ fake_hash_collision,
# second try goes ko
psycopg2.IntegrityError('tool already inserted'),
# ok then!
@@ -434,7 +440,7 @@
def test_retrying_proxy_storage_metadata_provider_add_with_retry(
- swh_storage, sample_data, mocker):
+ swh_storage, sample_data, mocker, fake_hash_collision):
"""Multiple retries for hash collision and psycopg2 error but finally ok
"""
@@ -445,7 +451,7 @@
'swh.storage.in_memory.InMemoryStorage.metadata_provider_add')
mock_memory.side_effect = [
# first try goes ko
- HashCollision('provider_id hash collision'),
+ fake_hash_collision,
# second try goes ko
psycopg2.IntegrityError('provider_id already inserted'),
# ok then!
@@ -518,7 +524,7 @@
def test_retrying_proxy_storage_origin_metadata_add_with_retry(
- swh_storage, sample_data, mocker):
+ swh_storage, sample_data, mocker, fake_hash_collision):
"""Multiple retries for hash collision and psycopg2 error but finally ok
"""
@@ -532,7 +538,7 @@
mock_memory.side_effect = [
# first try goes ko
- HashCollision('provider_id hash collision'),
+ fake_hash_collision,
# second try goes ko
psycopg2.IntegrityError('provider_id already inserted'),
# ok then!
@@ -614,7 +620,7 @@
def test_retrying_proxy_swh_storage_origin_visit_update_retry(
- swh_storage, sample_data, mocker):
+ swh_storage, sample_data, mocker, fake_hash_collision):
"""Multiple retries for hash collision and psycopg2 error but finally ok
"""
@@ -625,7 +631,7 @@
'swh.storage.in_memory.InMemoryStorage.origin_visit_update')
mock_memory.side_effect = [
# first try goes ko
- HashCollision('origin hash collision'),
+ fake_hash_collision,
# second try goes ko
psycopg2.IntegrityError('origin already inserted'),
# ok then!
@@ -687,7 +693,7 @@
def test_retrying_proxy_storage_directory_add_with_retry(
- swh_storage, sample_data, mocker):
+ swh_storage, sample_data, mocker, fake_hash_collision):
"""Multiple retries for hash collision and psycopg2 error but finally ok
"""
@@ -695,7 +701,7 @@
'swh.storage.in_memory.InMemoryStorage.directory_add')
mock_memory.side_effect = [
# first try goes ko
- HashCollision('directory hash collision'),
+ fake_hash_collision,
# second try goes ko
psycopg2.IntegrityError('directory already inserted'),
# ok then!
@@ -763,7 +769,7 @@
def test_retrying_proxy_storage_revision_add_with_retry(
- swh_storage, sample_data, mocker):
+ swh_storage, sample_data, mocker, fake_hash_collision):
"""Multiple retries for hash collision and psycopg2 error but finally ok
"""
@@ -771,7 +777,7 @@
'swh.storage.in_memory.InMemoryStorage.revision_add')
mock_memory.side_effect = [
# first try goes ko
- HashCollision('revision hash collision'),
+ fake_hash_collision,
# second try goes ko
psycopg2.IntegrityError('revision already inserted'),
# ok then!
@@ -840,7 +846,7 @@
def test_retrying_proxy_storage_release_add_with_retry(
- swh_storage, sample_data, mocker):
+ swh_storage, sample_data, mocker, fake_hash_collision):
"""Multiple retries for hash collision and psycopg2 error but finally ok
"""
@@ -848,7 +854,7 @@
'swh.storage.in_memory.InMemoryStorage.release_add')
mock_memory.side_effect = [
# first try goes ko
- HashCollision('release hash collision'),
+ fake_hash_collision,
# second try goes ko
psycopg2.IntegrityError('release already inserted'),
# ok then!
@@ -917,7 +923,7 @@
def test_retrying_proxy_storage_snapshot_add_with_retry(
- swh_storage, sample_data, mocker):
+ swh_storage, sample_data, mocker, fake_hash_collision):
"""Multiple retries for hash collision and psycopg2 error but finally ok
"""
@@ -925,7 +931,7 @@
'swh.storage.in_memory.InMemoryStorage.snapshot_add')
mock_memory.side_effect = [
# first try goes ko
- HashCollision('snapshot hash collision'),
+ fake_hash_collision,
# second try goes ko
psycopg2.IntegrityError('snapshot already inserted'),
# ok then!
diff --git a/swh/storage/tests/test_storage.py b/swh/storage/tests/test_storage.py
--- a/swh/storage/tests/test_storage.py
+++ b/swh/storage/tests/test_storage.py
@@ -30,10 +30,12 @@
Content, OriginVisit, Release, Revision
)
from swh.model.hypothesis_strategies import objects
-from swh.storage import HashCollision, get_storage
+from swh.model.hashutil import hash_to_hex
+from swh.storage import get_storage
from swh.storage.converters import origin_url_to_sha1 as sha1
-from swh.storage.exc import StorageArgumentException
+from swh.storage.exc import HashCollision, StorageArgumentException
from swh.storage.interface import StorageInterface
+from swh.storage.utils import content_hex_hashes
from .storage_data import data
@@ -314,12 +316,18 @@
with pytest.raises(HashCollision) as cm:
swh_storage.content_add([cont1, cont1b])
- actual_algo = cm.value.args[0]
+ exc = cm.value
+ actual_algo = exc.algo
assert actual_algo in ['sha1', 'sha1_git', 'blake2s256']
- actual_id = cm.value.args[1]
- assert actual_id == cont1[actual_algo]
- assert len(cm.value.args[2]) == 2
- assert cm.value.args[2] == [
+ actual_id = exc.hash_id
+ assert actual_id == hash_to_hex(cont1[actual_algo])
+ collisions = exc.args[2]
+ assert len(collisions) == 2
+ assert collisions == [
+ content_hex_hashes(Content.from_dict(cont1).hashes()),
+ content_hex_hashes(Content.from_dict(cont1b).hashes())
+ ]
+ assert exc.colliding_content_hashes() == [
Content.from_dict(cont1).hashes(),
Content.from_dict(cont1b).hashes()
]
@@ -387,12 +395,18 @@
with pytest.raises(HashCollision) as cm:
swh_storage.content_add_metadata([cont1, cont1b])
- actual_algo = cm.value.args[0]
+ exc = cm.value
+ actual_algo = exc.algo
assert actual_algo in ['sha1', 'sha1_git', 'blake2s256']
- actual_id = cm.value.args[1]
- assert actual_id == cont1[actual_algo]
- assert len(cm.value.args[2]) == 2
- assert cm.value.args[2] == [
+ actual_id = exc.hash_id
+ assert actual_id == hash_to_hex(cont1[actual_algo])
+ collisions = exc.args[2]
+ assert len(collisions) == 2
+ assert collisions == [
+ content_hex_hashes(Content.from_dict(cont1).hashes()),
+ content_hex_hashes(Content.from_dict(cont1b).hashes())
+ ]
+ assert exc.colliding_content_hashes() == [
Content.from_dict(cont1).hashes(),
Content.from_dict(cont1b).hashes()
]
diff --git a/swh/storage/tests/test_utils.py b/swh/storage/tests/test_utils.py
--- a/swh/storage/tests/test_utils.py
+++ b/swh/storage/tests/test_utils.py
@@ -3,33 +3,95 @@
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
-from swh.model.hashutil import hash_to_bytes
-from swh.storage.utils import extract_collision_hash
+from swh.model import hashutil
+from swh.storage.utils import (
+ extract_collision_hash, content_hex_hashes, content_bytes_hashes
+)
def test_extract_collision_hash():
for msg, expected_result in [
(
'Key (sha1)=(\\x34973274ccef6ab4dfaaf86599792fa9c3fe4689) ...',
- ('sha1', hash_to_bytes(
+ ('sha1', hashutil.hash_to_bytes(
'34973274ccef6ab4dfaaf86599792fa9c3fe4689')),
),
(
'Key (sha1_git)=(\\x34973274ccef6ab4dfaaf86599792fa9c3fe4699) already exists', # noqa
- ('sha1_git', hash_to_bytes(
+ ('sha1_git', hashutil.hash_to_bytes(
'34973274ccef6ab4dfaaf86599792fa9c3fe4699')),
),
(
'Key (sha256)=(\\x673650f936cb3b0a2f93ce09d81be10748b1b203c19e8176b4eefc1964a0cf3a) ...', # noqa
- ('sha256', hash_to_bytes(
+ ('sha256', hashutil.hash_to_bytes(
'673650f936cb3b0a2f93ce09d81be10748b1b203c19e8176b4eefc1964a0cf3a')) # noqa
),
(
'Key (blake2s)=(\\xd5fe1939576527e42cfd76a9455a2432fe7f56669564577dd93c4280e76d661d) ...', # noqa
- ('blake2s', hash_to_bytes(
+ ('blake2s', hashutil.hash_to_bytes(
'd5fe1939576527e42cfd76a9455a2432fe7f56669564577dd93c4280e76d661d')) # noqa
),
]:
assert extract_collision_hash(msg) == expected_result
assert extract_collision_hash('Nothing matching') is None
+
+
+def test_content_hex_hashes():
+ input_content = {
+ "blake2s256": hashutil.hash_to_bytes(
+ "8f677e3214ca8b2acad91884a1571ef3f12b786501f9a6bedfd6239d82095dd2"
+ ),
+ "sha1_git": hashutil.hash_to_bytes(
+ "ba9aaa145ccd24ef760cf31c74d8f7ca1a2e47b0"),
+ "sha256": hashutil.hash_to_bytes(
+ "2bb787a73e37352f92383abe7e2902936d1059ad9f1ba6daaa9c1e58ee6970d0"
+ ),
+ "sha1": hashutil.hash_to_bytes(
+ "38762cf7f55934b34d179ae6a4c80cadccbb7f0a"),
+ }
+
+ expected_content = {
+ "blake2s256":
+ "8f677e3214ca8b2acad91884a1571ef3f12b786501f9a6bedfd6239d82095dd2",
+ "sha1_git": "ba9aaa145ccd24ef760cf31c74d8f7ca1a2e47b0",
+ "sha256":
+ "2bb787a73e37352f92383abe7e2902936d1059ad9f1ba6daaa9c1e58ee6970d0",
+ "sha1": "38762cf7f55934b34d179ae6a4c80cadccbb7f0a"
+ }
+
+ actual_content = content_hex_hashes(input_content)
+
+ assert len(actual_content) == len(expected_content)
+ for algo in hashutil.DEFAULT_ALGORITHMS:
+ assert actual_content[algo] == expected_content[algo]
+
+
+def test_content_bytes_hashes():
+ input_content = {
+ "blake2s256":
+ "8f677e3214ca8b2acad91884a1571ef3f12b786501f9a6bedfd6239d82095dd2",
+ "sha1_git": "ba9aaa145ccd24ef760cf31c74d8f7ca1a2e47b0",
+ "sha256":
+ "2bb787a73e37352f92383abe7e2902936d1059ad9f1ba6daaa9c1e58ee6970d0",
+ "sha1": "38762cf7f55934b34d179ae6a4c80cadccbb7f0a"
+ }
+
+ expected_content = {
+ "blake2s256": hashutil.hash_to_bytes(
+ "8f677e3214ca8b2acad91884a1571ef3f12b786501f9a6bedfd6239d82095dd2"
+ ),
+ "sha1_git": hashutil.hash_to_bytes(
+ "ba9aaa145ccd24ef760cf31c74d8f7ca1a2e47b0"),
+ "sha256": hashutil.hash_to_bytes(
+ "2bb787a73e37352f92383abe7e2902936d1059ad9f1ba6daaa9c1e58ee6970d0"
+ ),
+ "sha1": hashutil.hash_to_bytes(
+ "38762cf7f55934b34d179ae6a4c80cadccbb7f0a"),
+ }
+
+ actual_content = content_bytes_hashes(input_content)
+
+ assert len(actual_content) == len(expected_content)
+ for algo in hashutil.DEFAULT_ALGORITHMS:
+ assert actual_content[algo] == expected_content[algo]
diff --git a/swh/storage/utils.py b/swh/storage/utils.py
--- a/swh/storage/utils.py
+++ b/swh/storage/utils.py
@@ -5,9 +5,11 @@
import re
-from typing import Optional, Tuple
+from typing import Dict, Optional, Tuple
-from swh.model.hashutil import hash_to_bytes
+from swh.model.hashutil import (
+ hash_to_bytes, hash_to_hex, DEFAULT_ALGORITHMS
+)
def _is_power_of_two(n: int) -> bool:
@@ -66,3 +68,21 @@
hash_id = result.group('id')
return hash_type, hash_to_bytes(hash_id)
return None
+
+
+def content_hex_hashes(content: Dict[str, bytes]) -> Dict[str, str]:
+ """Convert bytes hashes into hex hashes.
+
+ """
+ return {
+ algo: hash_to_hex(content[algo]) for algo in DEFAULT_ALGORITHMS
+ }
+
+
+def content_bytes_hashes(content: Dict[str, str]) -> Dict[str, bytes]:
+ """Convert bytes hashes into hex hashes.
+
+ """
+ return {
+ algo: hash_to_bytes(content[algo]) for algo in DEFAULT_ALGORITHMS
+ }
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Wed, Dec 18, 2:39 AM (1 d, 22 h ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3214171
Attached To
D2872: storage*: Hex encode content hashes in HashCollision exception
Event Timeline
Log In to Comment