diff --git a/swh/storage/__init__.py b/swh/storage/__init__.py
--- a/swh/storage/__init__.py
+++ b/swh/storage/__init__.py
@@ -18,6 +18,7 @@
     "buffer": ".buffer.BufferingProxyStorage",
     "retry": ".retry.RetryingProxyStorage",
     "cassandra": ".cassandra.CassandraStorage",
+    "validate": ".validate.ValidatingProxyStorage",
 }
 
 
diff --git a/swh/storage/tests/storage_data.py b/swh/storage/tests/storage_data.py
--- a/swh/storage/tests/storage_data.py
+++ b/swh/storage/tests/storage_data.py
@@ -48,7 +48,7 @@
         sha1=hash_to_bytes("34973274ccef6ab4dfaaf86599792fa9c3fe4689"),
         sha1_git=hash_to_bytes("d81cc0710eb6cf9efd5b920a8453e1e07157b6cd"),
         sha256=hash_to_bytes(
-            "673650f936cb3b0a2f93ce09d81be10748b1b203c19e8176b4eefc1964a0cf3a"
+            "084c799cd551dd1d8d5c5f9a5d593b2e931f5e36122ee5c793c1d08a19839cc0"
         ),
         blake2s256=hash_to_bytes(
             "d5fe1939576527e42cfd76a9455a2432fe7f56669564577dd93c4280e76d661d"
@@ -115,7 +115,7 @@
     directory5 = Directory(entries=())
 
     directory = Directory(
-        id=hash_to_bytes("34f335a750111ca0a8b64d8034faec9eedc396be"),
+        id=hash_to_bytes("5256e856a0a0898966d6ba14feb4388b8b82d302"),
         entries=tuple(
             [
                 DirectoryEntry(
@@ -193,7 +193,7 @@
     )
 
     revision = Revision(
-        id=hash_to_bytes("066b1b62dbfa033362092af468bf6cfabec230e7"),
+        id=hash_to_bytes("01a7114f36fddd5ef2511b2cadda237a68adbb12"),
         message=b"hello",
         author=Person(
             name=b"Nicolas Dandrimont",
@@ -372,7 +372,7 @@
     )
 
     release = Release(
-        id=hash_to_bytes("a673e617fcc6234e29b2cad06b8245f96c415c61"),
+        id=hash_to_bytes("f7f222093a18ec60d781070abec4a630c850b837"),
         name=b"v0.0.1",
         author=Person(
             name=b"olasd", email=b"nic@olasd.fr", fullname=b"olasd ",
@@ -425,7 +425,7 @@
     releases: Tuple[Release, ...] = (release, release2, release3)
 
     snapshot = Snapshot(
-        id=hash_to_bytes("409ee1ff3f10d166714bc90581debfd0446dda57"),
+        id=hash_to_bytes("9b922e6d8d5b803c1582aabe5525b7b91150788e"),
         branches={
             b"master": SnapshotBranch(
                 target=revision.id, target_type=TargetType.REVISION,
diff --git a/swh/storage/tests/test_validate.py b/swh/storage/tests/test_validate.py
new file mode 100644
--- /dev/null
+++ b/swh/storage/tests/test_validate.py
@@ -0,0 +1,144 @@
+# Copyright (C) 2019-2020 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import attr
+
+import pytest
+
+from swh.model.hashutil import hash_to_hex
+
+from swh.storage import get_storage
+from swh.storage.exc import StorageArgumentException
+
+
+@pytest.fixture
+def swh_storage():
+    """Return an in-memory storage behind the validating proxy."""
+    storage_config = {
+        "cls": "pipeline",
+        "steps": [{"cls": "validate"}, {"cls": "memory"},],
+    }
+
+    return get_storage(**storage_config)
+
+
+def test_validating_proxy_storage_content(swh_storage, sample_data):
+    """Content with mismatched hashes is rejected; valid content is added."""
+    sample_content = sample_data.content
+
+    content = swh_storage.content_get_data(sample_content.sha1)
+    assert content is None
+
+    with pytest.raises(StorageArgumentException, match="hashes"):
+        swh_storage.content_add([attr.evolve(sample_content, sha1=b"a" * 20)])
+
+    content = swh_storage.content_get_data(sample_content.sha1)
+    assert content is None
+
+    s = swh_storage.content_add([sample_content])
+    assert s == {
+        "content:add": 1,
+        "content:add:bytes": sample_content.length,
+    }
+
+    content = swh_storage.content_get_data(sample_content.sha1)
+    assert content is not None
+
+
+def test_validating_proxy_storage_skipped_content(swh_storage, sample_data):
+    """Skipped content goes through the proxy without any hash check."""
+    sample_content = sample_data.skipped_content
+    sample_content = attr.evolve(sample_content, sha1=b"a" * 20)
+    sample_content_dict = sample_content.to_dict()
+
+    swh_storage.skipped_content_add([sample_content])
+
+    content = list(swh_storage.skipped_content_missing([sample_content_dict]))
+    assert content == []
+
+    s = swh_storage.skipped_content_add([sample_content])
+    assert s == {
+        "skipped_content:add": 0,
+    }
+
+
+def test_validating_proxy_storage_directory(swh_storage, sample_data):
+    """A directory with a wrong id is rejected; a valid one is added."""
+    sample_directory = sample_data.directory
+    id_ = hash_to_hex(sample_directory.id)
+
+    assert swh_storage.directory_missing([sample_directory.id]) == [sample_directory.id]
+
+    with pytest.raises(StorageArgumentException, match=f"should be {id_}"):
+        swh_storage.directory_add([attr.evolve(sample_directory, id=b"a" * 20)])
+
+    assert swh_storage.directory_missing([sample_directory.id]) == [sample_directory.id]
+
+    s = swh_storage.directory_add([sample_directory])
+    assert s == {
+        "directory:add": 1,
+    }
+
+    assert swh_storage.directory_missing([sample_directory.id]) == []
+
+
+def test_validating_proxy_storage_revision(swh_storage, sample_data):
+    """A revision with a wrong id is rejected; a valid one is added."""
+    sample_revision = sample_data.revision
+    id_ = hash_to_hex(sample_revision.id)
+
+    assert swh_storage.revision_missing([sample_revision.id]) == [sample_revision.id]
+
+    with pytest.raises(StorageArgumentException, match=f"should be {id_}"):
+        swh_storage.revision_add([attr.evolve(sample_revision, id=b"a" * 20)])
+
+    assert swh_storage.revision_missing([sample_revision.id]) == [sample_revision.id]
+
+    s = swh_storage.revision_add([sample_revision])
+    assert s == {
+        "revision:add": 1,
+    }
+
+    assert swh_storage.revision_missing([sample_revision.id]) == []
+
+
+def test_validating_proxy_storage_release(swh_storage, sample_data):
+    """A release with a wrong id is rejected; a valid one is added."""
+    sample_release = sample_data.release
+    id_ = hash_to_hex(sample_release.id)
+
+    assert swh_storage.release_missing([sample_release.id]) == [sample_release.id]
+
+    with pytest.raises(StorageArgumentException, match=f"should be {id_}"):
+        swh_storage.release_add([attr.evolve(sample_release, id=b"a" * 20)])
+
+    assert swh_storage.release_missing([sample_release.id]) == [sample_release.id]
+
+    s = swh_storage.release_add([sample_release])
+    assert s == {
+        "release:add": 1,
+    }
+
+    assert swh_storage.release_missing([sample_release.id]) == []
+
+
+def test_validating_proxy_storage_snapshot(swh_storage, sample_data):
+    """A snapshot with a wrong id is rejected; a valid one is added."""
+    sample_snapshot = sample_data.snapshot
+    id_ = hash_to_hex(sample_snapshot.id)
+
+    assert swh_storage.snapshot_missing([sample_snapshot.id]) == [sample_snapshot.id]
+
+    with pytest.raises(StorageArgumentException, match=f"should be {id_}"):
+        swh_storage.snapshot_add([attr.evolve(sample_snapshot, id=b"a" * 20)])
+
+    assert swh_storage.snapshot_missing([sample_snapshot.id]) == [sample_snapshot.id]
+
+    s = swh_storage.snapshot_add([sample_snapshot])
+    assert s == {
+        "snapshot:add": 1,
+    }
+
+    assert swh_storage.snapshot_missing([sample_snapshot.id]) == []
diff --git a/swh/storage/validate.py b/swh/storage/validate.py
new file mode 100644
--- /dev/null
+++ b/swh/storage/validate.py
@@ -0,0 +1,98 @@
+# Copyright (C) 2020 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+
+from typing import Dict, Iterable, List
+
+from swh.model.hashutil import MultiHash, hash_to_bytes, hash_to_hex
+from swh.model.model import (
+    Content,
+    Directory,
+    Release,
+    Revision,
+    Snapshot,
+)
+
+from swh.storage import get_storage
+from swh.storage.exc import StorageArgumentException
+from swh.storage.interface import StorageInterface
+
+
+class ValidatingProxyStorage:
+    """Proxy for storage classes, which checks inserted objects have a correct hash.
+
+    Sample configuration use case for validating storage:
+
+    .. code-block:: yaml
+
+        storage:
+          cls: validate
+          storage:
+            cls: remote
+            url: http://storage.internal.staging.swh.network:5002/
+
+    """
+
+    def __init__(self, storage):
+        """Instantiate the proxied storage from its configuration dict."""
+        self.storage: StorageInterface = get_storage(**storage)
+
+    def __getattr__(self, key):
+        """Forward attributes the proxy does not define to the proxied storage."""
+        # __getattr__ only runs when normal lookup fails, so reaching it with
+        # "storage" means the attribute is unset; bail out instead of recursing.
+        if key == "storage":
+            raise AttributeError(key)
+        return getattr(self.storage, key)
+
+    def _check_hashes(self, objects: Iterable) -> None:
+        """Check that the id of each object matches the hash of its contents.
+
+        Raises:
+            StorageArgumentException: on the first object whose id differs from
+                the one recomputed with its class' ``compute_hash``.
+        """
+        for obj in objects:
+            id_ = hash_to_bytes(obj.__class__.compute_hash(obj.to_dict()))
+            if id_ != obj.id:
+                raise StorageArgumentException(
+                    f"Object has id {hash_to_hex(obj.id)}, "
+                    f"but it should be {hash_to_hex(id_)}: {obj}"
+                )
+
+    def content_add(self, content: List[Content]) -> Dict:
+        """Add contents after checking their hashes against their data.
+
+        Raises:
+            StorageArgumentException: if the hashes recomputed from the data of
+                a content differ from the ones it declares; nothing is added.
+        """
+        for cont in content:
+            hashes = MultiHash.from_data(cont.data).digest()
+            if hashes != cont.hashes():
+                raise StorageArgumentException(
+                    f"Object has hashes {cont.hashes()}, but they should be {hashes}"
+                )
+        return self.storage.content_add(content)
+
+    def directory_add(self, directories: List[Directory]) -> Dict:
+        """Add directories after checking their ids."""
+        self._check_hashes(directories)
+        return self.storage.directory_add(directories)
+
+    def revision_add(self, revisions: List[Revision]) -> Dict:
+        """Add revisions after checking their ids."""
+        self._check_hashes(revisions)
+        return self.storage.revision_add(revisions)
+
+    def release_add(self, releases: List[Release]) -> Dict:
+        """Add releases after checking their ids."""
+        self._check_hashes(releases)
+        return self.storage.release_add(releases)
+
+    def snapshot_add(self, snapshots: List[Snapshot]) -> Dict:
+        """Add snapshots after checking their ids."""
+        self._check_hashes(snapshots)
+        return self.storage.snapshot_add(snapshots)