Page MenuHomeSoftware Heritage

D2872.id10237.diff
No OneTemporary

D2872.id10237.diff

diff --git a/swh/storage/__init__.py b/swh/storage/__init__.py
--- a/swh/storage/__init__.py
+++ b/swh/storage/__init__.py
@@ -6,10 +6,6 @@
import warnings
-class HashCollision(Exception):
- pass
-
-
STORAGE_IMPLEMENTATION = {
'pipeline', 'local', 'remote', 'memory', 'filter', 'buffer', 'retry',
'validate', 'cassandra',
diff --git a/swh/storage/api/client.py b/swh/storage/api/client.py
--- a/swh/storage/api/client.py
+++ b/swh/storage/api/client.py
@@ -8,8 +8,7 @@
from swh.core.api import RPCClient, RemoteException
from swh.model.model import Content
-from .. import HashCollision
-from ..exc import StorageAPIError, StorageArgumentException
+from ..exc import StorageAPIError, StorageArgumentException, HashCollision
from ..interface import StorageInterface
from .serializers import ENCODERS, DECODERS
@@ -33,8 +32,8 @@
and e.args and e.args[0].get('type') == 'HashCollision':
# XXX: workaround until we fix these HashCollisions happening
# when they shouldn't
- raise HashCollision(
- *e.args[0]['args'])
+ algo, hash_id, colliding_contents = e.args[0]['args']
+ raise HashCollision(algo, hash_id, colliding_contents)
else:
raise
diff --git a/swh/storage/cassandra/storage.py b/swh/storage/cassandra/storage.py
--- a/swh/storage/cassandra/storage.py
+++ b/swh/storage/cassandra/storage.py
@@ -21,8 +21,7 @@
from swh.storage.objstorage import ObjStorage
from swh.storage.writer import JournalWriter
-from .. import HashCollision
-from ..exc import StorageArgumentException
+from ..exc import StorageArgumentException, HashCollision
from .common import TOKEN_BEGIN, TOKEN_END
from .converters import (
revision_to_db, revision_from_db, release_to_db, release_from_db,
diff --git a/swh/storage/exc.py b/swh/storage/exc.py
--- a/swh/storage/exc.py
+++ b/swh/storage/exc.py
@@ -1,8 +1,14 @@
-# Copyright (C) 2015-2016 The Software Heritage developers
+# Copyright (C) 2015-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
+from typing import Dict, List
+
+from swh.model.hashutil import hash_to_hex
+
+from swh.storage.utils import content_hex_hashes, content_bytes_hashes
+
class StorageDBError(Exception):
"""Specific storage db error (connection, erroneous queries, etc...)
@@ -26,3 +32,20 @@
class StorageArgumentException(Exception):
"""Argument passed to a Storage endpoint is invalid."""
pass
+
+
+class HashCollision(Exception):
+ """Exception raised when a content collides in a storage backend
+
+ """
+ def __init__(self, algo, hash_id, colliding_contents):
+ super().__init__()
+ self.algo = algo
+ self.hash_id = hash_to_hex(hash_id)
+ self.colliding_contents = [content_hex_hashes(c)
+ for c in colliding_contents]
+ # backward compatibility: keep (algo, hash_id, contents) in args
+ self.args = [self.algo, self.hash_id, self.colliding_contents]
+
+ def colliding_content_hashes(self) -> List[Dict[str, bytes]]:
+ return [content_bytes_hashes(c) for c in self.colliding_contents]
diff --git a/swh/storage/in_memory.py b/swh/storage/in_memory.py
--- a/swh/storage/in_memory.py
+++ b/swh/storage/in_memory.py
@@ -25,8 +25,7 @@
from swh.model.hashutil import DEFAULT_ALGORITHMS, hash_to_bytes, hash_to_hex
from swh.storage.objstorage import ObjStorage
-from . import HashCollision
-from .exc import StorageArgumentException
+from .exc import StorageArgumentException, HashCollision
from .converters import origin_url_to_sha1
from .utils import get_partition_bounds_bytes
diff --git a/swh/storage/storage.py b/swh/storage/storage.py
--- a/swh/storage/storage.py
+++ b/swh/storage/storage.py
@@ -25,10 +25,10 @@
from swh.model.hashutil import DEFAULT_ALGORITHMS, hash_to_bytes, hash_to_hex
from swh.storage.objstorage import ObjStorage
-from . import converters, HashCollision
+from . import converters
from .common import db_transaction_generator, db_transaction
from .db import Db
-from .exc import StorageArgumentException, StorageDBError
+from .exc import StorageArgumentException, StorageDBError, HashCollision
from .algos import diff
from .metrics import timed, send_metric, process_metrics
from .utils import (
diff --git a/swh/storage/tests/test_exception.py b/swh/storage/tests/test_exception.py
new file mode 100644
--- /dev/null
+++ b/swh/storage/tests/test_exception.py
@@ -0,0 +1,34 @@
+# Copyright (C) 2020 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from swh.model import hashutil
+
+from swh.storage.exc import HashCollision
+from swh.storage.utils import content_hex_hashes
+
+
+def test_hash_collision_exception():
+ hex_hash_id = "38762cf7f55934b34d179ae6a4c80cadccbb7f0a"
+ hash_id = hashutil.hash_to_bytes(hex_hash_id)
+
+ content = {
+ "blake2s256": hashutil.hash_to_bytes(
+ "8f677e3214ca8b2acad91884a1571ef3f12b786501f9a6bedfd6239d82095dd2"
+ ),
+ "sha1_git": hashutil.hash_to_bytes(
+ "ba9aaa145ccd24ef760cf31c74d8f7ca1a2e47b0"),
+ "sha256": hashutil.hash_to_bytes(
+ "2bb787a73e37352f92383abe7e2902936d1059ad9f1ba6daaa9c1e58ee6970d0"
+ ),
+ "sha1": hash_id,
+ }
+
+ exc = HashCollision('sha1', hash_id, [content])
+
+ assert exc.algo == 'sha1'
+ assert exc.hash_id == hex_hash_id
+ assert exc.colliding_contents == [content_hex_hashes(content)]
+
+ assert exc.colliding_content_hashes() == [content]
diff --git a/swh/storage/tests/test_retry.py b/swh/storage/tests/test_retry.py
--- a/swh/storage/tests/test_retry.py
+++ b/swh/storage/tests/test_retry.py
@@ -13,8 +13,8 @@
Content, Directory, Release, Revision, Snapshot, Origin
)
-from swh.storage import HashCollision, get_storage
-from swh.storage.exc import StorageArgumentException
+from swh.storage import get_storage
+from swh.storage.exc import HashCollision, StorageArgumentException
@pytest.fixture
diff --git a/swh/storage/tests/test_storage.py b/swh/storage/tests/test_storage.py
--- a/swh/storage/tests/test_storage.py
+++ b/swh/storage/tests/test_storage.py
@@ -30,10 +30,12 @@
Content, OriginVisit, Release, Revision
)
from swh.model.hypothesis_strategies import objects
-from swh.storage import HashCollision, get_storage
+from swh.model.hashutil import hash_to_hex
+from swh.storage import get_storage
from swh.storage.converters import origin_url_to_sha1 as sha1
-from swh.storage.exc import StorageArgumentException
+from swh.storage.exc import HashCollision, StorageArgumentException
from swh.storage.interface import StorageInterface
+from swh.storage.utils import content_hex_hashes
from .storage_data import data
@@ -314,12 +316,18 @@
with pytest.raises(HashCollision) as cm:
swh_storage.content_add([cont1, cont1b])
- actual_algo = cm.value.args[0]
+ exc = cm.value
+ actual_algo = exc.algo
assert actual_algo in ['sha1', 'sha1_git', 'blake2s256']
- actual_id = cm.value.args[1]
- assert actual_id == cont1[actual_algo]
- assert len(cm.value.args[2]) == 2
- assert cm.value.args[2] == [
+ actual_id = exc.hash_id
+ assert actual_id == hash_to_hex(cont1[actual_algo])
+ collisions = exc.args[2]
+ assert len(collisions) == 2
+ assert collisions == [
+ content_hex_hashes(Content.from_dict(cont1).hashes()),
+ content_hex_hashes(Content.from_dict(cont1b).hashes())
+ ]
+ assert exc.colliding_content_hashes() == [
Content.from_dict(cont1).hashes(),
Content.from_dict(cont1b).hashes()
]
@@ -387,12 +395,18 @@
with pytest.raises(HashCollision) as cm:
swh_storage.content_add_metadata([cont1, cont1b])
- actual_algo = cm.value.args[0]
+ exc = cm.value
+ actual_algo = exc.algo
assert actual_algo in ['sha1', 'sha1_git', 'blake2s256']
- actual_id = cm.value.args[1]
- assert actual_id == cont1[actual_algo]
- assert len(cm.value.args[2]) == 2
- assert cm.value.args[2] == [
+ actual_id = exc.hash_id
+ assert actual_id == hash_to_hex(cont1[actual_algo])
+ collisions = exc.args[2]
+ assert len(collisions) == 2
+ assert collisions == [
+ content_hex_hashes(Content.from_dict(cont1).hashes()),
+ content_hex_hashes(Content.from_dict(cont1b).hashes())
+ ]
+ assert exc.colliding_content_hashes() == [
Content.from_dict(cont1).hashes(),
Content.from_dict(cont1b).hashes()
]
diff --git a/swh/storage/tests/test_utils.py b/swh/storage/tests/test_utils.py
--- a/swh/storage/tests/test_utils.py
+++ b/swh/storage/tests/test_utils.py
@@ -3,33 +3,94 @@
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
-from swh.model.hashutil import hash_to_bytes
-from swh.storage.utils import extract_collision_hash
+from swh.model import hashutil
+from swh.storage.utils import (
+ extract_collision_hash, content_hex_hashes, content_bytes_hashes
+)
def test_extract_collision_hash():
for msg, expected_result in [
(
'Key (sha1)=(\\x34973274ccef6ab4dfaaf86599792fa9c3fe4689) ...',
- ('sha1', hash_to_bytes(
- '34973274ccef6ab4dfaaf86599792fa9c3fe4689')),
+ ('sha1', '34973274ccef6ab4dfaaf86599792fa9c3fe4689'),
),
(
'Key (sha1_git)=(\\x34973274ccef6ab4dfaaf86599792fa9c3fe4699) already exists', # noqa
- ('sha1_git', hash_to_bytes(
- '34973274ccef6ab4dfaaf86599792fa9c3fe4699')),
+ ('sha1_git',
+ '34973274ccef6ab4dfaaf86599792fa9c3fe4699'),
),
(
'Key (sha256)=(\\x673650f936cb3b0a2f93ce09d81be10748b1b203c19e8176b4eefc1964a0cf3a) ...', # noqa
- ('sha256', hash_to_bytes(
- '673650f936cb3b0a2f93ce09d81be10748b1b203c19e8176b4eefc1964a0cf3a')) # noqa
+ ('sha256',
+ '673650f936cb3b0a2f93ce09d81be10748b1b203c19e8176b4eefc1964a0cf3a') # noqa
),
(
'Key (blake2s)=(\\xd5fe1939576527e42cfd76a9455a2432fe7f56669564577dd93c4280e76d661d) ...', # noqa
- ('blake2s', hash_to_bytes(
- 'd5fe1939576527e42cfd76a9455a2432fe7f56669564577dd93c4280e76d661d')) # noqa
+ ('blake2s',
+ 'd5fe1939576527e42cfd76a9455a2432fe7f56669564577dd93c4280e76d661d') # noqa
),
]:
assert extract_collision_hash(msg) == expected_result
assert extract_collision_hash('Nothing matching') is None
+
+
+def test_content_hex_hashes():
+ input_content = {
+ "blake2s256": hashutil.hash_to_bytes(
+ "8f677e3214ca8b2acad91884a1571ef3f12b786501f9a6bedfd6239d82095dd2"
+ ),
+ "sha1_git": hashutil.hash_to_bytes(
+ "ba9aaa145ccd24ef760cf31c74d8f7ca1a2e47b0"),
+ "sha256": hashutil.hash_to_bytes(
+ "2bb787a73e37352f92383abe7e2902936d1059ad9f1ba6daaa9c1e58ee6970d0"
+ ),
+ "sha1": hashutil.hash_to_bytes(
+ "38762cf7f55934b34d179ae6a4c80cadccbb7f0a"),
+ }
+
+ expected_content = {
+ "blake2s256":
+ "8f677e3214ca8b2acad91884a1571ef3f12b786501f9a6bedfd6239d82095dd2",
+ "sha1_git": "ba9aaa145ccd24ef760cf31c74d8f7ca1a2e47b0",
+ "sha256":
+ "2bb787a73e37352f92383abe7e2902936d1059ad9f1ba6daaa9c1e58ee6970d0",
+ "sha1": "38762cf7f55934b34d179ae6a4c80cadccbb7f0a"
+ }
+
+ actual_content = content_hex_hashes(input_content)
+
+ assert len(actual_content) == len(expected_content)
+ for algo in hashutil.DEFAULT_ALGORITHMS:
+ assert actual_content[algo] == expected_content[algo]
+
+
+def test_content_bytes_hashes():
+ input_content = {
+ "blake2s256":
+ "8f677e3214ca8b2acad91884a1571ef3f12b786501f9a6bedfd6239d82095dd2",
+ "sha1_git": "ba9aaa145ccd24ef760cf31c74d8f7ca1a2e47b0",
+ "sha256":
+ "2bb787a73e37352f92383abe7e2902936d1059ad9f1ba6daaa9c1e58ee6970d0",
+ "sha1": "38762cf7f55934b34d179ae6a4c80cadccbb7f0a"
+ }
+
+ expected_content = {
+ "blake2s256": hashutil.hash_to_bytes(
+ "8f677e3214ca8b2acad91884a1571ef3f12b786501f9a6bedfd6239d82095dd2"
+ ),
+ "sha1_git": hashutil.hash_to_bytes(
+ "ba9aaa145ccd24ef760cf31c74d8f7ca1a2e47b0"),
+ "sha256": hashutil.hash_to_bytes(
+ "2bb787a73e37352f92383abe7e2902936d1059ad9f1ba6daaa9c1e58ee6970d0"
+ ),
+ "sha1": hashutil.hash_to_bytes(
+ "38762cf7f55934b34d179ae6a4c80cadccbb7f0a"),
+ }
+
+ actual_content = content_bytes_hashes(input_content)
+
+ assert len(actual_content) == len(expected_content)
+ for algo in hashutil.DEFAULT_ALGORITHMS:
+ assert actual_content[algo] == expected_content[algo]
diff --git a/swh/storage/utils.py b/swh/storage/utils.py
--- a/swh/storage/utils.py
+++ b/swh/storage/utils.py
@@ -5,9 +5,11 @@
import re
-from typing import Optional, Tuple
+from typing import Dict, Optional, Tuple
-from swh.model.hashutil import hash_to_bytes
+from swh.model.hashutil import (
+ hash_to_bytes, hash_to_hex, DEFAULT_ALGORITHMS
+)
def _is_power_of_two(n: int) -> bool:
@@ -66,3 +68,21 @@
hash_id = result.group('id')
return hash_type, hash_to_bytes(hash_id)
return None
+
+
+def content_hex_hashes(content: Dict[str, bytes]) -> Dict[str, str]:
+ """Convert bytes hashes into hex hashes.
+
+ """
+ return {
+ algo: hash_to_hex(content[algo]) for algo in DEFAULT_ALGORITHMS
+ }
+
+
+def content_bytes_hashes(content: Dict[str, str]) -> Dict[str, bytes]:
+ """Convert hex hashes into bytes hashes.
+
+ """
+ return {
+ algo: hash_to_bytes(content[algo]) for algo in DEFAULT_ALGORITHMS
+ }

File Metadata

Mime Type
text/plain
Expires
Tue, Dec 17, 10:24 AM (3 d, 7 h ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3223929

Event Timeline