Page MenuHomeSoftware Heritage

D2872.diff
No OneTemporary

D2872.diff

diff --git a/swh/storage/__init__.py b/swh/storage/__init__.py
--- a/swh/storage/__init__.py
+++ b/swh/storage/__init__.py
@@ -6,10 +6,6 @@
import warnings
-class HashCollision(Exception):
- pass
-
-
STORAGE_IMPLEMENTATION = {
'pipeline', 'local', 'remote', 'memory', 'filter', 'buffer', 'retry',
'validate', 'cassandra',
diff --git a/swh/storage/api/client.py b/swh/storage/api/client.py
--- a/swh/storage/api/client.py
+++ b/swh/storage/api/client.py
@@ -8,8 +8,7 @@
from swh.core.api import RPCClient, RemoteException
from swh.model.model import Content
-from .. import HashCollision
-from ..exc import StorageAPIError, StorageArgumentException
+from ..exc import StorageAPIError, StorageArgumentException, HashCollision
from ..interface import StorageInterface
from .serializers import ENCODERS, DECODERS
diff --git a/swh/storage/cassandra/storage.py b/swh/storage/cassandra/storage.py
--- a/swh/storage/cassandra/storage.py
+++ b/swh/storage/cassandra/storage.py
@@ -21,8 +21,7 @@
from swh.storage.objstorage import ObjStorage
from swh.storage.writer import JournalWriter
-from .. import HashCollision
-from ..exc import StorageArgumentException
+from ..exc import StorageArgumentException, HashCollision
from .common import TOKEN_BEGIN, TOKEN_END
from .converters import (
revision_to_db, revision_from_db, release_to_db, release_from_db,
diff --git a/swh/storage/exc.py b/swh/storage/exc.py
--- a/swh/storage/exc.py
+++ b/swh/storage/exc.py
@@ -1,8 +1,14 @@
-# Copyright (C) 2015-2016 The Software Heritage developers
+# Copyright (C) 2015-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
+from typing import Dict, List
+
+from swh.model.hashutil import hash_to_hex
+
+from swh.storage.utils import content_hex_hashes, content_bytes_hashes
+
class StorageDBError(Exception):
"""Specific storage db error (connection, erroneous queries, etc...)
@@ -26,3 +32,18 @@
class StorageArgumentException(Exception):
"""Argument passed to a Storage endpoint is invalid."""
pass
+
+
+class HashCollision(Exception):
+ """Exception raised when a content collides in a storage backend
+
+ """
+ def __init__(self, algo, hash_id, colliding_contents):
+ self.algo = algo
+ self.hash_id = hash_to_hex(hash_id)
+ self.colliding_contents = [content_hex_hashes(c)
+ for c in colliding_contents]
+ super().__init__(self.algo, self.hash_id, self.colliding_contents)
+
+ def colliding_content_hashes(self) -> List[Dict[str, bytes]]:
+ return [content_bytes_hashes(c) for c in self.colliding_contents]
diff --git a/swh/storage/in_memory.py b/swh/storage/in_memory.py
--- a/swh/storage/in_memory.py
+++ b/swh/storage/in_memory.py
@@ -25,8 +25,7 @@
from swh.model.hashutil import DEFAULT_ALGORITHMS, hash_to_bytes, hash_to_hex
from swh.storage.objstorage import ObjStorage
-from . import HashCollision
-from .exc import StorageArgumentException
+from .exc import StorageArgumentException, HashCollision
from .converters import origin_url_to_sha1
from .utils import get_partition_bounds_bytes
diff --git a/swh/storage/storage.py b/swh/storage/storage.py
--- a/swh/storage/storage.py
+++ b/swh/storage/storage.py
@@ -25,10 +25,10 @@
from swh.model.hashutil import DEFAULT_ALGORITHMS, hash_to_bytes, hash_to_hex
from swh.storage.objstorage import ObjStorage
-from . import converters, HashCollision
+from . import converters
from .common import db_transaction_generator, db_transaction
from .db import Db
-from .exc import StorageArgumentException, StorageDBError
+from .exc import StorageArgumentException, StorageDBError, HashCollision
from .algos import diff
from .metrics import timed, send_metric, process_metrics
from .utils import (
diff --git a/swh/storage/tests/test_exception.py b/swh/storage/tests/test_exception.py
new file mode 100644
--- /dev/null
+++ b/swh/storage/tests/test_exception.py
@@ -0,0 +1,34 @@
+# Copyright (C) 2020 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from swh.model import hashutil
+
+from swh.storage.exc import HashCollision
+from swh.storage.utils import content_hex_hashes
+
+
+def test_hash_collision_exception():
+ hex_hash_id = "38762cf7f55934b34d179ae6a4c80cadccbb7f0a"
+ hash_id = hashutil.hash_to_bytes(hex_hash_id)
+
+ content = {
+ "blake2s256": hashutil.hash_to_bytes(
+ "8f677e3214ca8b2acad91884a1571ef3f12b786501f9a6bedfd6239d82095dd2"
+ ),
+ "sha1_git": hashutil.hash_to_bytes(
+ "ba9aaa145ccd24ef760cf31c74d8f7ca1a2e47b0"),
+ "sha256": hashutil.hash_to_bytes(
+ "2bb787a73e37352f92383abe7e2902936d1059ad9f1ba6daaa9c1e58ee6970d0"
+ ),
+ "sha1": hash_id,
+ }
+
+ exc = HashCollision('sha1', hash_id, [content])
+
+ assert exc.algo == 'sha1'
+ assert exc.hash_id == hex_hash_id
+ assert exc.colliding_contents == [content_hex_hashes(content)]
+
+ assert exc.colliding_content_hashes() == [content]
diff --git a/swh/storage/tests/test_retry.py b/swh/storage/tests/test_retry.py
--- a/swh/storage/tests/test_retry.py
+++ b/swh/storage/tests/test_retry.py
@@ -13,8 +13,14 @@
Content, Directory, Release, Revision, Snapshot, Origin
)
-from swh.storage import HashCollision, get_storage
-from swh.storage.exc import StorageArgumentException
+from swh.storage import get_storage
+from swh.storage.exc import HashCollision, StorageArgumentException
+
+
+@pytest.fixture
+def fake_hash_collision(sample_data):
+ return HashCollision(
+ 'sha1', "38762cf7f55934b34d179ae6a4c80cadccbb7f0a", [])
@pytest.fixture
@@ -50,7 +56,7 @@
def test_retrying_proxy_storage_content_add_with_retry(
- swh_storage, sample_data, mocker):
+ swh_storage, sample_data, mocker, fake_hash_collision):
"""Multiple retries for hash collision and psycopg2 error but finally ok
"""
@@ -58,7 +64,7 @@
'swh.storage.in_memory.InMemoryStorage.content_add')
mock_memory.side_effect = [
# first try goes ko
- HashCollision('content hash collision'),
+ fake_hash_collision,
# second try goes ko
psycopg2.IntegrityError('content already inserted'),
# ok then!
@@ -126,7 +132,7 @@
def test_retrying_proxy_storage_content_add_metadata_with_retry(
- swh_storage, sample_data, mocker):
+ swh_storage, sample_data, mocker, fake_hash_collision):
"""Multiple retries for hash collision and psycopg2 error but finally ok
"""
@@ -134,7 +140,7 @@
'swh.storage.in_memory.InMemoryStorage.content_add_metadata')
mock_memory.side_effect = [
# first try goes ko
- HashCollision('content_metadata hash collision'),
+ fake_hash_collision,
# second try goes ko
psycopg2.IntegrityError('content_metadata already inserted'),
# ok then!
@@ -196,7 +202,7 @@
def test_retrying_proxy_swh_storage_origin_add_one_retry(
- swh_storage, sample_data, mocker):
+ swh_storage, sample_data, mocker, fake_hash_collision):
"""Multiple retries for hash collision and psycopg2 error but finally ok
"""
@@ -205,7 +211,7 @@
'swh.storage.in_memory.InMemoryStorage.origin_add_one')
mock_memory.side_effect = [
# first try goes ko
- HashCollision('origin hash collision'),
+ fake_hash_collision,
# second try goes ko
psycopg2.IntegrityError('origin already inserted'),
# ok then!
@@ -270,7 +276,7 @@
def test_retrying_proxy_swh_storage_origin_visit_add_retry(
- swh_storage, sample_data, mocker):
+ swh_storage, sample_data, mocker, fake_hash_collision):
"""Multiple retries for hash collision and psycopg2 error but finally ok
"""
@@ -281,7 +287,7 @@
'swh.storage.in_memory.InMemoryStorage.origin_visit_add')
mock_memory.side_effect = [
# first try goes ko
- HashCollision('origin hash collision'),
+ fake_hash_collision,
# second try goes ko
psycopg2.IntegrityError('origin already inserted'),
# ok then!
@@ -349,7 +355,7 @@
def test_retrying_proxy_storage_tool_add_with_retry(
- swh_storage, sample_data, mocker):
+ swh_storage, sample_data, mocker, fake_hash_collision):
"""Multiple retries for hash collision and psycopg2 error but finally ok
"""
@@ -358,7 +364,7 @@
'swh.storage.in_memory.InMemoryStorage.tool_add')
mock_memory.side_effect = [
# first try goes ko
- HashCollision('tool hash collision'),
+ fake_hash_collision,
# second try goes ko
psycopg2.IntegrityError('tool already inserted'),
# ok then!
@@ -434,7 +440,7 @@
def test_retrying_proxy_storage_metadata_provider_add_with_retry(
- swh_storage, sample_data, mocker):
+ swh_storage, sample_data, mocker, fake_hash_collision):
"""Multiple retries for hash collision and psycopg2 error but finally ok
"""
@@ -445,7 +451,7 @@
'swh.storage.in_memory.InMemoryStorage.metadata_provider_add')
mock_memory.side_effect = [
# first try goes ko
- HashCollision('provider_id hash collision'),
+ fake_hash_collision,
# second try goes ko
psycopg2.IntegrityError('provider_id already inserted'),
# ok then!
@@ -518,7 +524,7 @@
def test_retrying_proxy_storage_origin_metadata_add_with_retry(
- swh_storage, sample_data, mocker):
+ swh_storage, sample_data, mocker, fake_hash_collision):
"""Multiple retries for hash collision and psycopg2 error but finally ok
"""
@@ -532,7 +538,7 @@
mock_memory.side_effect = [
# first try goes ko
- HashCollision('provider_id hash collision'),
+ fake_hash_collision,
# second try goes ko
psycopg2.IntegrityError('provider_id already inserted'),
# ok then!
@@ -614,7 +620,7 @@
def test_retrying_proxy_swh_storage_origin_visit_update_retry(
- swh_storage, sample_data, mocker):
+ swh_storage, sample_data, mocker, fake_hash_collision):
"""Multiple retries for hash collision and psycopg2 error but finally ok
"""
@@ -625,7 +631,7 @@
'swh.storage.in_memory.InMemoryStorage.origin_visit_update')
mock_memory.side_effect = [
# first try goes ko
- HashCollision('origin hash collision'),
+ fake_hash_collision,
# second try goes ko
psycopg2.IntegrityError('origin already inserted'),
# ok then!
@@ -687,7 +693,7 @@
def test_retrying_proxy_storage_directory_add_with_retry(
- swh_storage, sample_data, mocker):
+ swh_storage, sample_data, mocker, fake_hash_collision):
"""Multiple retries for hash collision and psycopg2 error but finally ok
"""
@@ -695,7 +701,7 @@
'swh.storage.in_memory.InMemoryStorage.directory_add')
mock_memory.side_effect = [
# first try goes ko
- HashCollision('directory hash collision'),
+ fake_hash_collision,
# second try goes ko
psycopg2.IntegrityError('directory already inserted'),
# ok then!
@@ -763,7 +769,7 @@
def test_retrying_proxy_storage_revision_add_with_retry(
- swh_storage, sample_data, mocker):
+ swh_storage, sample_data, mocker, fake_hash_collision):
"""Multiple retries for hash collision and psycopg2 error but finally ok
"""
@@ -771,7 +777,7 @@
'swh.storage.in_memory.InMemoryStorage.revision_add')
mock_memory.side_effect = [
# first try goes ko
- HashCollision('revision hash collision'),
+ fake_hash_collision,
# second try goes ko
psycopg2.IntegrityError('revision already inserted'),
# ok then!
@@ -840,7 +846,7 @@
def test_retrying_proxy_storage_release_add_with_retry(
- swh_storage, sample_data, mocker):
+ swh_storage, sample_data, mocker, fake_hash_collision):
"""Multiple retries for hash collision and psycopg2 error but finally ok
"""
@@ -848,7 +854,7 @@
'swh.storage.in_memory.InMemoryStorage.release_add')
mock_memory.side_effect = [
# first try goes ko
- HashCollision('release hash collision'),
+ fake_hash_collision,
# second try goes ko
psycopg2.IntegrityError('release already inserted'),
# ok then!
@@ -917,7 +923,7 @@
def test_retrying_proxy_storage_snapshot_add_with_retry(
- swh_storage, sample_data, mocker):
+ swh_storage, sample_data, mocker, fake_hash_collision):
"""Multiple retries for hash collision and psycopg2 error but finally ok
"""
@@ -925,7 +931,7 @@
'swh.storage.in_memory.InMemoryStorage.snapshot_add')
mock_memory.side_effect = [
# first try goes ko
- HashCollision('snapshot hash collision'),
+ fake_hash_collision,
# second try goes ko
psycopg2.IntegrityError('snapshot already inserted'),
# ok then!
diff --git a/swh/storage/tests/test_storage.py b/swh/storage/tests/test_storage.py
--- a/swh/storage/tests/test_storage.py
+++ b/swh/storage/tests/test_storage.py
@@ -30,10 +30,12 @@
Content, OriginVisit, Release, Revision
)
from swh.model.hypothesis_strategies import objects
-from swh.storage import HashCollision, get_storage
+from swh.model.hashutil import hash_to_hex
+from swh.storage import get_storage
from swh.storage.converters import origin_url_to_sha1 as sha1
-from swh.storage.exc import StorageArgumentException
+from swh.storage.exc import HashCollision, StorageArgumentException
from swh.storage.interface import StorageInterface
+from swh.storage.utils import content_hex_hashes
from .storage_data import data
@@ -314,12 +316,18 @@
with pytest.raises(HashCollision) as cm:
swh_storage.content_add([cont1, cont1b])
- actual_algo = cm.value.args[0]
+ exc = cm.value
+ actual_algo = exc.algo
assert actual_algo in ['sha1', 'sha1_git', 'blake2s256']
- actual_id = cm.value.args[1]
- assert actual_id == cont1[actual_algo]
- assert len(cm.value.args[2]) == 2
- assert cm.value.args[2] == [
+ actual_id = exc.hash_id
+ assert actual_id == hash_to_hex(cont1[actual_algo])
+ collisions = exc.args[2]
+ assert len(collisions) == 2
+ assert collisions == [
+ content_hex_hashes(Content.from_dict(cont1).hashes()),
+ content_hex_hashes(Content.from_dict(cont1b).hashes())
+ ]
+ assert exc.colliding_content_hashes() == [
Content.from_dict(cont1).hashes(),
Content.from_dict(cont1b).hashes()
]
@@ -387,12 +395,18 @@
with pytest.raises(HashCollision) as cm:
swh_storage.content_add_metadata([cont1, cont1b])
- actual_algo = cm.value.args[0]
+ exc = cm.value
+ actual_algo = exc.algo
assert actual_algo in ['sha1', 'sha1_git', 'blake2s256']
- actual_id = cm.value.args[1]
- assert actual_id == cont1[actual_algo]
- assert len(cm.value.args[2]) == 2
- assert cm.value.args[2] == [
+ actual_id = exc.hash_id
+ assert actual_id == hash_to_hex(cont1[actual_algo])
+ collisions = exc.args[2]
+ assert len(collisions) == 2
+ assert collisions == [
+ content_hex_hashes(Content.from_dict(cont1).hashes()),
+ content_hex_hashes(Content.from_dict(cont1b).hashes())
+ ]
+ assert exc.colliding_content_hashes() == [
Content.from_dict(cont1).hashes(),
Content.from_dict(cont1b).hashes()
]
diff --git a/swh/storage/tests/test_utils.py b/swh/storage/tests/test_utils.py
--- a/swh/storage/tests/test_utils.py
+++ b/swh/storage/tests/test_utils.py
@@ -3,33 +3,95 @@
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
-from swh.model.hashutil import hash_to_bytes
-from swh.storage.utils import extract_collision_hash
+from swh.model import hashutil
+from swh.storage.utils import (
+ extract_collision_hash, content_hex_hashes, content_bytes_hashes
+)
def test_extract_collision_hash():
for msg, expected_result in [
(
'Key (sha1)=(\\x34973274ccef6ab4dfaaf86599792fa9c3fe4689) ...',
- ('sha1', hash_to_bytes(
+ ('sha1', hashutil.hash_to_bytes(
'34973274ccef6ab4dfaaf86599792fa9c3fe4689')),
),
(
'Key (sha1_git)=(\\x34973274ccef6ab4dfaaf86599792fa9c3fe4699) already exists', # noqa
- ('sha1_git', hash_to_bytes(
+ ('sha1_git', hashutil.hash_to_bytes(
'34973274ccef6ab4dfaaf86599792fa9c3fe4699')),
),
(
'Key (sha256)=(\\x673650f936cb3b0a2f93ce09d81be10748b1b203c19e8176b4eefc1964a0cf3a) ...', # noqa
- ('sha256', hash_to_bytes(
+ ('sha256', hashutil.hash_to_bytes(
'673650f936cb3b0a2f93ce09d81be10748b1b203c19e8176b4eefc1964a0cf3a')) # noqa
),
(
'Key (blake2s)=(\\xd5fe1939576527e42cfd76a9455a2432fe7f56669564577dd93c4280e76d661d) ...', # noqa
- ('blake2s', hash_to_bytes(
+ ('blake2s', hashutil.hash_to_bytes(
'd5fe1939576527e42cfd76a9455a2432fe7f56669564577dd93c4280e76d661d')) # noqa
),
]:
assert extract_collision_hash(msg) == expected_result
assert extract_collision_hash('Nothing matching') is None
+
+
+def test_content_hex_hashes():
+ input_content = {
+ "blake2s256": hashutil.hash_to_bytes(
+ "8f677e3214ca8b2acad91884a1571ef3f12b786501f9a6bedfd6239d82095dd2"
+ ),
+ "sha1_git": hashutil.hash_to_bytes(
+ "ba9aaa145ccd24ef760cf31c74d8f7ca1a2e47b0"),
+ "sha256": hashutil.hash_to_bytes(
+ "2bb787a73e37352f92383abe7e2902936d1059ad9f1ba6daaa9c1e58ee6970d0"
+ ),
+ "sha1": hashutil.hash_to_bytes(
+ "38762cf7f55934b34d179ae6a4c80cadccbb7f0a"),
+ }
+
+ expected_content = {
+ "blake2s256":
+ "8f677e3214ca8b2acad91884a1571ef3f12b786501f9a6bedfd6239d82095dd2",
+ "sha1_git": "ba9aaa145ccd24ef760cf31c74d8f7ca1a2e47b0",
+ "sha256":
+ "2bb787a73e37352f92383abe7e2902936d1059ad9f1ba6daaa9c1e58ee6970d0",
+ "sha1": "38762cf7f55934b34d179ae6a4c80cadccbb7f0a"
+ }
+
+ actual_content = content_hex_hashes(input_content)
+
+ assert len(actual_content) == len(expected_content)
+ for algo in hashutil.DEFAULT_ALGORITHMS:
+ assert actual_content[algo] == expected_content[algo]
+
+
+def test_content_bytes_hashes():
+ input_content = {
+ "blake2s256":
+ "8f677e3214ca8b2acad91884a1571ef3f12b786501f9a6bedfd6239d82095dd2",
+ "sha1_git": "ba9aaa145ccd24ef760cf31c74d8f7ca1a2e47b0",
+ "sha256":
+ "2bb787a73e37352f92383abe7e2902936d1059ad9f1ba6daaa9c1e58ee6970d0",
+ "sha1": "38762cf7f55934b34d179ae6a4c80cadccbb7f0a"
+ }
+
+ expected_content = {
+ "blake2s256": hashutil.hash_to_bytes(
+ "8f677e3214ca8b2acad91884a1571ef3f12b786501f9a6bedfd6239d82095dd2"
+ ),
+ "sha1_git": hashutil.hash_to_bytes(
+ "ba9aaa145ccd24ef760cf31c74d8f7ca1a2e47b0"),
+ "sha256": hashutil.hash_to_bytes(
+ "2bb787a73e37352f92383abe7e2902936d1059ad9f1ba6daaa9c1e58ee6970d0"
+ ),
+ "sha1": hashutil.hash_to_bytes(
+ "38762cf7f55934b34d179ae6a4c80cadccbb7f0a"),
+ }
+
+ actual_content = content_bytes_hashes(input_content)
+
+ assert len(actual_content) == len(expected_content)
+ for algo in hashutil.DEFAULT_ALGORITHMS:
+ assert actual_content[algo] == expected_content[algo]
diff --git a/swh/storage/utils.py b/swh/storage/utils.py
--- a/swh/storage/utils.py
+++ b/swh/storage/utils.py
@@ -5,9 +5,11 @@
import re
-from typing import Optional, Tuple
+from typing import Dict, Optional, Tuple
-from swh.model.hashutil import hash_to_bytes
+from swh.model.hashutil import (
+ hash_to_bytes, hash_to_hex, DEFAULT_ALGORITHMS
+)
def _is_power_of_two(n: int) -> bool:
@@ -66,3 +68,21 @@
hash_id = result.group('id')
return hash_type, hash_to_bytes(hash_id)
return None
+
+
+def content_hex_hashes(content: Dict[str, bytes]) -> Dict[str, str]:
+ """Convert bytes hashes into hex hashes.
+
+ """
+ return {
+ algo: hash_to_hex(content[algo]) for algo in DEFAULT_ALGORITHMS
+ }
+
+
+def content_bytes_hashes(content: Dict[str, str]) -> Dict[str, bytes]:
+ """Convert bytes hashes into hex hashes.
+
+ """
+ return {
+ algo: hash_to_bytes(content[algo]) for algo in DEFAULT_ALGORITHMS
+ }

File Metadata

Mime Type
text/plain
Expires
Wed, Dec 18, 2:39 AM (1 d, 22 h ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3214171

Event Timeline