diff --git a/swh/storage/__init__.py b/swh/storage/__init__.py
index b8a3be08..e78b4de8 100644
--- a/swh/storage/__init__.py
+++ b/swh/storage/__init__.py
@@ -1,104 +1,105 @@
 # Copyright (C) 2015-2020 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 import importlib
 from typing import Any, Dict, List
 import warnings
 
 from .interface import StorageInterface
 
 STORAGE_IMPLEMENTATIONS = {
     "local": ".postgresql.storage.Storage",
     "remote": ".api.client.RemoteStorage",
     "memory": ".in_memory.InMemoryStorage",
     "filter": ".filter.FilteringProxyStorage",
     "buffer": ".buffer.BufferingProxyStorage",
     "retry": ".retry.RetryingProxyStorage",
     "cassandra": ".cassandra.CassandraStorage",
+    "validate": ".validate.ValidatingProxyStorage",
 }
 
 
 def get_storage(cls: str, **kwargs) -> StorageInterface:
     """Get a storage object of class `cls` with arguments `kwargs`.
 
     Args:
         cls (str): the storage's class, one of the keys of
             STORAGE_IMPLEMENTATIONS, or "pipeline"
         kwargs: arguments passed to the storage class constructor
 
     Returns:
         an instance of swh.storage.Storage or compatible class
 
     Raises:
         ValueError if passed an unknown storage class.
 
     """
     if "args" in kwargs:
         warnings.warn(
             'Explicit "args" key is deprecated, use keys directly instead.',
             DeprecationWarning,
         )
         kwargs = kwargs["args"]
 
     if cls == "pipeline":
         return get_storage_pipeline(**kwargs)
 
     class_path = STORAGE_IMPLEMENTATIONS.get(cls)
     if class_path is None:
         raise ValueError(
             "Unknown storage class `%s`. Supported: %s"
             % (cls, ", ".join(STORAGE_IMPLEMENTATIONS))
         )
 
     (module_path, class_name) = class_path.rsplit(".", 1)
     module = importlib.import_module(module_path, package=__package__)
     Storage = getattr(module, class_name)
     check_config = kwargs.pop("check_config", {})
     storage = Storage(**kwargs)
     if check_config:
         if not storage.check_config(**check_config):
             raise EnvironmentError("storage check config failed")
     return storage
 
 
 def get_storage_pipeline(
     steps: List[Dict[str, Any]], check_config=None
 ) -> StorageInterface:
     """Recursively get a storage object that may use other storage objects
     as backends.
 
     Args:
         steps (List[dict]): List of dicts that may be used as kwargs for
             `get_storage`.
 
     Returns:
         an instance of swh.storage.Storage or compatible class
 
     Raises:
         ValueError if passed an unknown storage class.
""" storage_config = None for step in reversed(steps): if "args" in step: warnings.warn( 'Explicit "args" key is deprecated, use keys directly ' "instead.", DeprecationWarning, ) step = { "cls": step["cls"], **step["args"], } if storage_config: step["storage"] = storage_config step["check_config"] = check_config storage_config = step if storage_config is None: raise ValueError("'pipeline' has no steps.") return get_storage(**storage_config) diff --git a/swh/storage/tests/storage_data.py b/swh/storage/tests/storage_data.py index 5cdf99af..fe904176 100644 --- a/swh/storage/tests/storage_data.py +++ b/swh/storage/tests/storage_data.py @@ -1,553 +1,553 @@ # Copyright (C) 2015-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import datetime import attr from typing import Tuple from swh.model.hashutil import hash_to_bytes, hash_to_hex from swh.model import from_disk from swh.model.identifiers import parse_swhid from swh.model.model import ( Content, Directory, DirectoryEntry, MetadataAuthority, MetadataAuthorityType, MetadataFetcher, MetadataTargetType, ObjectType, Origin, OriginVisit, Person, RawExtrinsicMetadata, Release, Revision, RevisionType, SkippedContent, Snapshot, SnapshotBranch, TargetType, Timestamp, TimestampWithTimezone, ) class StorageData: """Data model objects to use within tests. """ content = Content( data=b"42\n", length=3, sha1=hash_to_bytes("34973274ccef6ab4dfaaf86599792fa9c3fe4689"), sha1_git=hash_to_bytes("d81cc0710eb6cf9efd5b920a8453e1e07157b6cd"), sha256=hash_to_bytes( - "673650f936cb3b0a2f93ce09d81be10748b1b203c19e8176b4eefc1964a0cf3a" + "084c799cd551dd1d8d5c5f9a5d593b2e931f5e36122ee5c793c1d08a19839cc0" ), blake2s256=hash_to_bytes( "d5fe1939576527e42cfd76a9455a2432fe7f56669564577dd93c4280e76d661d" ), status="visible", ) content2 = Content( data=b"4242\n", length=5, sha1=hash_to_bytes("61c2b3a30496d329e21af70dd2d7e097046d07b7"), sha1_git=hash_to_bytes("36fade77193cb6d2bd826161a0979d64c28ab4fa"), sha256=hash_to_bytes( "859f0b154fdb2d630f45e1ecae4a862915435e663248bb8461d914696fc047cd" ), blake2s256=hash_to_bytes( "849c20fad132b7c2d62c15de310adfe87be94a379941bed295e8141c6219810d" ), status="visible", ) content3 = Content( data=b"424242\n", length=7, sha1=hash_to_bytes("3e21cc4942a4234c9e5edd8a9cacd1670fe59f13"), sha1_git=hash_to_bytes("c932c7649c6dfa4b82327d121215116909eb3bea"), sha256=hash_to_bytes( "92fb72daf8c6818288a35137b72155f507e5de8d892712ab96277aaed8cf8a36" ), blake2s256=hash_to_bytes( "76d0346f44e5a27f6bafdd9c2befd304aff83780f93121d801ab6a1d4769db11" ), status="visible", ctime=datetime.datetime(2019, 12, 1, tzinfo=datetime.timezone.utc), ) contents: Tuple[Content, ...] 
diff --git a/swh/storage/tests/storage_data.py b/swh/storage/tests/storage_data.py
index 5cdf99af..fe904176 100644
--- a/swh/storage/tests/storage_data.py
+++ b/swh/storage/tests/storage_data.py
@@ -1,553 +1,553 @@
 # Copyright (C) 2015-2020 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 import datetime
 
 import attr
 
 from typing import Tuple
 
 from swh.model.hashutil import hash_to_bytes, hash_to_hex
 from swh.model import from_disk
 from swh.model.identifiers import parse_swhid
 from swh.model.model import (
     Content,
     Directory,
     DirectoryEntry,
     MetadataAuthority,
     MetadataAuthorityType,
     MetadataFetcher,
     MetadataTargetType,
     ObjectType,
     Origin,
     OriginVisit,
     Person,
     RawExtrinsicMetadata,
     Release,
     Revision,
     RevisionType,
     SkippedContent,
     Snapshot,
     SnapshotBranch,
     TargetType,
     Timestamp,
     TimestampWithTimezone,
 )
 
 
 class StorageData:
     """Data model objects to use within tests.
 
     """
 
     content = Content(
         data=b"42\n",
         length=3,
         sha1=hash_to_bytes("34973274ccef6ab4dfaaf86599792fa9c3fe4689"),
         sha1_git=hash_to_bytes("d81cc0710eb6cf9efd5b920a8453e1e07157b6cd"),
         sha256=hash_to_bytes(
-            "673650f936cb3b0a2f93ce09d81be10748b1b203c19e8176b4eefc1964a0cf3a"
+            "084c799cd551dd1d8d5c5f9a5d593b2e931f5e36122ee5c793c1d08a19839cc0"
         ),
         blake2s256=hash_to_bytes(
             "d5fe1939576527e42cfd76a9455a2432fe7f56669564577dd93c4280e76d661d"
         ),
         status="visible",
     )
     content2 = Content(
         data=b"4242\n",
         length=5,
         sha1=hash_to_bytes("61c2b3a30496d329e21af70dd2d7e097046d07b7"),
         sha1_git=hash_to_bytes("36fade77193cb6d2bd826161a0979d64c28ab4fa"),
         sha256=hash_to_bytes(
             "859f0b154fdb2d630f45e1ecae4a862915435e663248bb8461d914696fc047cd"
         ),
         blake2s256=hash_to_bytes(
             "849c20fad132b7c2d62c15de310adfe87be94a379941bed295e8141c6219810d"
         ),
         status="visible",
     )
     content3 = Content(
         data=b"424242\n",
         length=7,
         sha1=hash_to_bytes("3e21cc4942a4234c9e5edd8a9cacd1670fe59f13"),
         sha1_git=hash_to_bytes("c932c7649c6dfa4b82327d121215116909eb3bea"),
         sha256=hash_to_bytes(
             "92fb72daf8c6818288a35137b72155f507e5de8d892712ab96277aaed8cf8a36"
         ),
         blake2s256=hash_to_bytes(
             "76d0346f44e5a27f6bafdd9c2befd304aff83780f93121d801ab6a1d4769db11"
         ),
         status="visible",
         ctime=datetime.datetime(2019, 12, 1, tzinfo=datetime.timezone.utc),
     )
     contents: Tuple[Content, ...] = (content, content2, content3)
 
     skipped_content = SkippedContent(
         length=1024 * 1024 * 200,
         sha1_git=hash_to_bytes("33e45d56f88993aae6a0198013efa80716fd8920"),
         sha1=hash_to_bytes("43e45d56f88993aae6a0198013efa80716fd8920"),
         sha256=hash_to_bytes(
             "7bbd052ab054ef222c1c87be60cd191addedd24cc882d1f5f7f7be61dc61bb3a"
         ),
         blake2s256=hash_to_bytes(
             "ade18b1adecb33f891ca36664da676e12c772cc193778aac9a137b8dc5834b9b"
         ),
         reason="Content too long",
         status="absent",
         origin="file:///dev/zero",
     )
     skipped_content2 = SkippedContent(
         length=1024 * 1024 * 300,
         sha1_git=hash_to_bytes("44e45d56f88993aae6a0198013efa80716fd8921"),
         sha1=hash_to_bytes("54e45d56f88993aae6a0198013efa80716fd8920"),
         sha256=hash_to_bytes(
             "8cbd052ab054ef222c1c87be60cd191addedd24cc882d1f5f7f7be61dc61bb3a"
         ),
         blake2s256=hash_to_bytes(
             "9ce18b1adecb33f891ca36664da676e12c772cc193778aac9a137b8dc5834b9b"
         ),
         reason="Content too long",
         status="absent",
     )
     skipped_contents: Tuple[SkippedContent, ...] = (skipped_content, skipped_content2)
 
     directory5 = Directory(entries=())
     directory = Directory(
-        id=hash_to_bytes("34f335a750111ca0a8b64d8034faec9eedc396be"),
+        id=hash_to_bytes("5256e856a0a0898966d6ba14feb4388b8b82d302"),
         entries=tuple(
             [
                 DirectoryEntry(
                     name=b"foo",
                     type="file",
                     target=content.sha1_git,
                     perms=from_disk.DentryPerms.content,
                 ),
                 DirectoryEntry(
                     name=b"bar\xc3",
                     type="dir",
                     target=directory5.id,
                     perms=from_disk.DentryPerms.directory,
                 ),
             ],
         ),
     )
     directory2 = Directory(
         id=hash_to_bytes("8505808532953da7d2581741f01b29c04b1cb9ab"),
         entries=tuple(
             [
                 DirectoryEntry(
                     name=b"oof",
                     type="file",
                     target=content2.sha1_git,
                     perms=from_disk.DentryPerms.content,
                 )
             ],
         ),
     )
     directory3 = Directory(
         id=hash_to_bytes("4ea8c6b2f54445e5dd1a9d5bb2afd875d66f3150"),
         entries=tuple(
             [
                 DirectoryEntry(
                     name=b"foo",
                     type="file",
                     target=content.sha1_git,
                     perms=from_disk.DentryPerms.content,
                 ),
                 DirectoryEntry(
                     name=b"subdir",
                     type="dir",
                     target=directory.id,
                     perms=from_disk.DentryPerms.directory,
                 ),
                 DirectoryEntry(
                     name=b"hello",
                     type="file",
                     target=content2.sha1_git,
                     perms=from_disk.DentryPerms.content,
                 ),
             ],
         ),
     )
     directory4 = Directory(
         id=hash_to_bytes("377aa5fcd944fbabf502dbfda55cd14d33c8c3c6"),
         entries=tuple(
             [
                 DirectoryEntry(
                     name=b"subdir1",
                     type="dir",
                     target=directory3.id,
                     perms=from_disk.DentryPerms.directory,
                 )
             ],
         ),
     )
     directories: Tuple[Directory, ...] = (
         directory2,
         directory,
         directory3,
         directory4,
         directory5,
     )
 
     revision = Revision(
-        id=hash_to_bytes("066b1b62dbfa033362092af468bf6cfabec230e7"),
+        id=hash_to_bytes("01a7114f36fddd5ef2511b2cadda237a68adbb12"),
         message=b"hello",
         author=Person(
             name=b"Nicolas Dandrimont",
             email=b"nicolas@example.com",
             fullname=b"Nicolas Dandrimont <nicolas@example.com>",
         ),
         date=TimestampWithTimezone(
             timestamp=Timestamp(seconds=1234567890, microseconds=0),
             offset=120,
             negative_utc=False,
         ),
         committer=Person(
             name=b"St\xc3fano Zacchiroli",
             email=b"stefano@example.com",
             fullname=b"St\xc3fano Zacchiroli <stefano@example.com>",
         ),
         committer_date=TimestampWithTimezone(
             timestamp=Timestamp(seconds=1123456789, microseconds=0),
             offset=120,
             negative_utc=False,
         ),
         parents=(),
         type=RevisionType.GIT,
         directory=directory.id,
         metadata={
             "checksums": {"sha1": "tarball-sha1", "sha256": "tarball-sha256",},
             "signed-off-by": "some-dude",
         },
         extra_headers=(
             (b"gpgsig", b"test123"),
             (b"mergetag", b"foo\\bar"),
             (b"mergetag", b"\x22\xaf\x89\x80\x01\x00"),
         ),
         synthetic=True,
     )
     revision2 = Revision(
         id=hash_to_bytes("df7a6f6a99671fb7f7343641aff983a314ef6161"),
         message=b"hello again",
         author=Person(
             name=b"Roberto Dicosmo",
             email=b"roberto@example.com",
             fullname=b"Roberto Dicosmo <roberto@example.com>",
         ),
         date=TimestampWithTimezone(
             timestamp=Timestamp(seconds=1234567843, microseconds=220000,),
             offset=-720,
             negative_utc=False,
         ),
         committer=Person(
             name=b"tony",
             email=b"ar@dumont.fr",
             fullname=b"tony <ar@dumont.fr>",
         ),
         committer_date=TimestampWithTimezone(
             timestamp=Timestamp(seconds=1123456789, microseconds=220000,),
             offset=0,
             negative_utc=False,
         ),
         parents=tuple([revision.id]),
         type=RevisionType.GIT,
         directory=directory2.id,
         metadata=None,
         extra_headers=(),
         synthetic=False,
     )
     revision3 = Revision(
         id=hash_to_bytes("2cbd7bb22c653bbb23a29657852a50a01b591d46"),
         message=b"a simple revision with no parents this time",
         author=Person(
             name=b"Roberto Dicosmo",
             email=b"roberto@example.com",
             fullname=b"Roberto Dicosmo <roberto@example.com>",
         ),
         date=TimestampWithTimezone(
             timestamp=Timestamp(seconds=1234567843, microseconds=220000,),
             offset=-720,
             negative_utc=False,
         ),
         committer=Person(
             name=b"tony",
             email=b"ar@dumont.fr",
             fullname=b"tony <ar@dumont.fr>",
         ),
         committer_date=TimestampWithTimezone(
             timestamp=Timestamp(seconds=1127351742, microseconds=220000,),
             offset=0,
             negative_utc=False,
         ),
         parents=tuple([revision.id, revision2.id]),
         type=RevisionType.GIT,
         directory=directory2.id,
         metadata=None,
         extra_headers=(),
         synthetic=True,
     )
     revision4 = Revision(
         id=hash_to_bytes("88cd5126fc958ed70089d5340441a1c2477bcc20"),
         message=b"parent of self.revision2",
         author=Person(
             name=b"me",
             email=b"me@soft.heri",
             fullname=b"me <me@soft.heri>",
         ),
         date=TimestampWithTimezone(
             timestamp=Timestamp(seconds=1234567843, microseconds=220000,),
             offset=-720,
             negative_utc=False,
         ),
         committer=Person(
             name=b"committer-dude",
             email=b"committer@dude.com",
             fullname=b"committer-dude <committer@dude.com>",
         ),
         committer_date=TimestampWithTimezone(
             timestamp=Timestamp(seconds=1244567843, microseconds=220000,),
             offset=-720,
             negative_utc=False,
         ),
         parents=tuple([revision3.id]),
         type=RevisionType.GIT,
         directory=directory.id,
         metadata=None,
         extra_headers=(),
         synthetic=False,
     )
     revisions: Tuple[Revision, ...] = (revision, revision2, revision3, revision4)
     origins: Tuple[Origin, ...] = (
         Origin(url="https://github.com/user1/repo1"),
         Origin(url="https://github.com/user2/repo1"),
         Origin(url="https://github.com/user3/repo1"),
         Origin(url="https://gitlab.com/user1/repo1"),
         Origin(url="https://gitlab.com/user2/repo1"),
         Origin(url="https://forge.softwareheritage.org/source/repo1"),
     )
     origin, origin2 = origins[:2]
 
     metadata_authority = MetadataAuthority(
         type=MetadataAuthorityType.DEPOSIT_CLIENT,
         url="http://hal.inria.example.com/",
         metadata={"location": "France"},
     )
     metadata_authority2 = MetadataAuthority(
         type=MetadataAuthorityType.REGISTRY,
         url="http://wikidata.example.com/",
         metadata={},
     )
     authorities: Tuple[MetadataAuthority, ...] = (
         metadata_authority,
         metadata_authority2,
     )
 
     metadata_fetcher = MetadataFetcher(
         name="swh-deposit", version="0.0.1", metadata={"sword_version": "2"},
     )
     metadata_fetcher2 = MetadataFetcher(
         name="swh-example", version="0.0.1", metadata={},
     )
     fetchers: Tuple[MetadataFetcher, ...] = (metadata_fetcher, metadata_fetcher2)
 
     date_visit1 = datetime.datetime(2015, 1, 1, 23, 0, 0, tzinfo=datetime.timezone.utc)
     date_visit2 = datetime.datetime(2017, 1, 1, 23, 0, 0, tzinfo=datetime.timezone.utc)
     date_visit3 = datetime.datetime(2018, 1, 1, 23, 0, 0, tzinfo=datetime.timezone.utc)
 
     type_visit1 = "git"
     type_visit2 = "hg"
     type_visit3 = "deb"
 
     origin_visit = OriginVisit(
         origin=origin.url, visit=1, date=date_visit1, type=type_visit1,
     )
     origin_visit2 = OriginVisit(
         origin=origin.url, visit=2, date=date_visit2, type=type_visit1,
     )
     origin_visit3 = OriginVisit(
         origin=origin2.url, visit=1, date=date_visit1, type=type_visit2,
     )
     origin_visits: Tuple[OriginVisit, ...] = (
         origin_visit,
         origin_visit2,
         origin_visit3,
     )
 
     release = Release(
-        id=hash_to_bytes("a673e617fcc6234e29b2cad06b8245f96c415c61"),
+        id=hash_to_bytes("f7f222093a18ec60d781070abec4a630c850b837"),
         name=b"v0.0.1",
         author=Person(
             name=b"olasd",
             email=b"nic@olasd.fr",
             fullname=b"olasd <nic@olasd.fr>",
         ),
         date=TimestampWithTimezone(
             timestamp=Timestamp(seconds=1234567890, microseconds=0),
             offset=42,
             negative_utc=False,
         ),
         target=revision.id,
         target_type=ObjectType.REVISION,
         message=b"synthetic release",
         synthetic=True,
     )
     release2 = Release(
         id=hash_to_bytes("6902bd4c82b7d19a421d224aedab2b74197e420d"),
         name=b"v0.0.2",
         author=Person(
             name=b"tony",
             email=b"ar@dumont.fr",
             fullname=b"tony <ar@dumont.fr>",
         ),
         date=TimestampWithTimezone(
             timestamp=Timestamp(seconds=1634366813, microseconds=0),
             offset=-120,
             negative_utc=False,
         ),
         target=revision2.id,
         target_type=ObjectType.REVISION,
         message=b"v0.0.2\nMisc performance improvements + bug fixes",
         synthetic=False,
     )
     release3 = Release(
         id=hash_to_bytes("3e9050196aa288264f2a9d279d6abab8b158448b"),
         name=b"v0.0.2",
         author=Person(
             name=b"tony",
             email=b"tony@ardumont.fr",
             fullname=b"tony <tony@ardumont.fr>",
         ),
         date=TimestampWithTimezone(
             timestamp=Timestamp(seconds=1634366813, microseconds=0),
             offset=-120,
             negative_utc=False,
         ),
         target=revision3.id,
         target_type=ObjectType.REVISION,
         message=b"yet another synthetic release",
         synthetic=True,
     )
     releases: Tuple[Release, ...] = (release, release2, release3)
 
     snapshot = Snapshot(
-        id=hash_to_bytes("409ee1ff3f10d166714bc90581debfd0446dda57"),
+        id=hash_to_bytes("9b922e6d8d5b803c1582aabe5525b7b91150788e"),
         branches={
             b"master": SnapshotBranch(
                 target=revision.id, target_type=TargetType.REVISION,
             ),
         },
     )
     empty_snapshot = Snapshot(
         id=hash_to_bytes("1a8893e6a86f444e8be8e7bda6cb34fb1735a00e"), branches={},
     )
     complete_snapshot = Snapshot(
         id=hash_to_bytes("a56ce2d81c190023bb99a3a36279307522cb85f6"),
         branches={
             b"directory": SnapshotBranch(
                 target=directory.id, target_type=TargetType.DIRECTORY,
             ),
             b"directory2": SnapshotBranch(
                 target=directory2.id, target_type=TargetType.DIRECTORY,
             ),
             b"content": SnapshotBranch(
                 target=content.sha1_git, target_type=TargetType.CONTENT,
             ),
             b"alias": SnapshotBranch(target=b"revision", target_type=TargetType.ALIAS,),
             b"revision": SnapshotBranch(
                 target=revision.id, target_type=TargetType.REVISION,
             ),
             b"release": SnapshotBranch(
                 target=release.id, target_type=TargetType.RELEASE,
             ),
             b"snapshot": SnapshotBranch(
                 target=empty_snapshot.id, target_type=TargetType.SNAPSHOT,
             ),
             b"dangling": None,
         },
     )
     snapshots: Tuple[Snapshot, ...] = (snapshot, empty_snapshot, complete_snapshot)
 
     content_metadata1 = RawExtrinsicMetadata(
         type=MetadataTargetType.CONTENT,
         id=parse_swhid(f"swh:1:cnt:{hash_to_hex(content.sha1_git)}"),
         origin=origin.url,
         discovery_date=datetime.datetime(
             2015, 1, 1, 21, 0, 0, tzinfo=datetime.timezone.utc
         ),
         authority=attr.evolve(metadata_authority, metadata=None),
         fetcher=attr.evolve(metadata_fetcher, metadata=None),
         format="json",
         metadata=b'{"foo": "bar"}',
     )
     content_metadata2 = RawExtrinsicMetadata(
         type=MetadataTargetType.CONTENT,
         id=parse_swhid(f"swh:1:cnt:{hash_to_hex(content.sha1_git)}"),
         origin=origin2.url,
         discovery_date=datetime.datetime(
             2017, 1, 1, 22, 0, 0, tzinfo=datetime.timezone.utc
         ),
         authority=attr.evolve(metadata_authority, metadata=None),
         fetcher=attr.evolve(metadata_fetcher, metadata=None),
         format="yaml",
         metadata=b"foo: bar",
     )
     content_metadata3 = RawExtrinsicMetadata(
         type=MetadataTargetType.CONTENT,
         id=parse_swhid(f"swh:1:cnt:{hash_to_hex(content.sha1_git)}"),
         discovery_date=datetime.datetime(
             2017, 1, 1, 22, 0, 0, tzinfo=datetime.timezone.utc
         ),
         authority=attr.evolve(metadata_authority2, metadata=None),
         fetcher=attr.evolve(metadata_fetcher2, metadata=None),
         format="yaml",
         metadata=b"foo: bar",
         origin=origin.url,
         visit=42,
         snapshot=parse_swhid(f"swh:1:snp:{hash_to_hex(snapshot.id)}"),
         release=parse_swhid(f"swh:1:rel:{hash_to_hex(release.id)}"),
         revision=parse_swhid(f"swh:1:rev:{hash_to_hex(revision.id)}"),
         directory=parse_swhid(f"swh:1:dir:{hash_to_hex(directory.id)}"),
         path=b"/foo/bar",
     )
     content_metadata: Tuple[RawExtrinsicMetadata, ...] = (
         content_metadata1,
         content_metadata2,
         content_metadata3,
     )
 
     origin_metadata1 = RawExtrinsicMetadata(
         type=MetadataTargetType.ORIGIN,
         id=origin.url,
         discovery_date=datetime.datetime(
             2015, 1, 1, 21, 0, 0, tzinfo=datetime.timezone.utc
         ),
         authority=attr.evolve(metadata_authority, metadata=None),
         fetcher=attr.evolve(metadata_fetcher, metadata=None),
         format="json",
         metadata=b'{"foo": "bar"}',
     )
     origin_metadata2 = RawExtrinsicMetadata(
         type=MetadataTargetType.ORIGIN,
         id=origin.url,
         discovery_date=datetime.datetime(
             2017, 1, 1, 22, 0, 0, tzinfo=datetime.timezone.utc
         ),
         authority=attr.evolve(metadata_authority, metadata=None),
         fetcher=attr.evolve(metadata_fetcher, metadata=None),
         format="yaml",
         metadata=b"foo: bar",
     )
     origin_metadata3 = RawExtrinsicMetadata(
         type=MetadataTargetType.ORIGIN,
         id=origin.url,
         discovery_date=datetime.datetime(
             2017, 1, 1, 22, 0, 0, tzinfo=datetime.timezone.utc
         ),
         authority=attr.evolve(metadata_authority2, metadata=None),
         fetcher=attr.evolve(metadata_fetcher2, metadata=None),
         format="yaml",
         metadata=b"foo: bar",
     )
     origin_metadata: Tuple[RawExtrinsicMetadata, ...] = (
         origin_metadata1,
         origin_metadata2,
         origin_metadata3,
     )
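The corrected identifiers above can be spot-checked. The `content` sha256, for instance, is just the digest of the payload `b"42\n"`, verifiable with the standard library alone; the directory/revision/release/snapshot ids are swh.model intrinsic hashes, which the proxy below recomputes via `compute_hash`.

    import hashlib

    # The corrected sha256 of StorageData.content is the digest of its data...
    assert (
        hashlib.sha256(b"42\n").hexdigest()
        == "084c799cd551dd1d8d5c5f9a5d593b2e931f5e36122ee5c793c1d08a19839cc0"
    )
    # ...and the unchanged sha1 already matched, which is why only sha256 moved.
    assert hashlib.sha1(b"42\n").hexdigest() == "34973274ccef6ab4dfaaf86599792fa9c3fe4689"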
diff --git a/swh/storage/tests/test_validate.py b/swh/storage/tests/test_validate.py
new file mode 100644
index 00000000..1e430d99
--- /dev/null
+++ b/swh/storage/tests/test_validate.py
@@ -0,0 +1,137 @@
+# Copyright (C) 2019-2020 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import attr
+
+import pytest
+
+from swh.model.hashutil import hash_to_hex
+
+from swh.storage import get_storage
+from swh.storage.exc import StorageArgumentException
+
+
+@pytest.fixture
+def swh_storage():
+    storage_config = {
+        "cls": "pipeline",
+        "steps": [{"cls": "validate"}, {"cls": "memory"},],
+    }
+
+    return get_storage(**storage_config)
+
+
+def test_validating_proxy_storage_content(swh_storage, sample_data):
+    sample_content = sample_data.content
+
+    content = swh_storage.content_get_data(sample_content.sha1)
+    assert content is None
+
+    with pytest.raises(StorageArgumentException, match="hashes"):
+        s = swh_storage.content_add([attr.evolve(sample_content, sha1=b"a" * 20)])
+
+    content = swh_storage.content_get_data(sample_content.sha1)
+    assert content is None
+
+    s = swh_storage.content_add([sample_content])
+    assert s == {
+        "content:add": 1,
+        "content:add:bytes": sample_content.length,
+    }
+
+    content = swh_storage.content_get_data(sample_content.sha1)
+    assert content is not None
+
+
+def test_validating_proxy_storage_skipped_content(swh_storage, sample_data):
+    sample_content = sample_data.skipped_content
+    sample_content = attr.evolve(sample_content, sha1=b"a" * 20)
+    sample_content_dict = sample_content.to_dict()
+
+    s = swh_storage.skipped_content_add([sample_content])
+
+    content = list(swh_storage.skipped_content_missing([sample_content_dict]))
+    assert content == []
+
+    s = swh_storage.skipped_content_add([sample_content])
+    assert s == {
+        "skipped_content:add": 0,
+    }
+
+
+def test_validating_proxy_storage_directory(swh_storage, sample_data):
+    sample_directory = sample_data.directory
+    id_ = hash_to_hex(sample_directory.id)
+
+    assert swh_storage.directory_missing([sample_directory.id]) == [sample_directory.id]
+
+    with pytest.raises(StorageArgumentException, match=f"should be {id_}"):
+        s = swh_storage.directory_add([attr.evolve(sample_directory, id=b"a" * 20)])
+
+    assert swh_storage.directory_missing([sample_directory.id]) == [sample_directory.id]
+
+    s = swh_storage.directory_add([sample_directory])
+    assert s == {
+        "directory:add": 1,
+    }
+
+    assert swh_storage.directory_missing([sample_directory.id]) == []
+
+
+def test_validating_proxy_storage_revision(swh_storage, sample_data):
+    sample_revision = sample_data.revision
+    id_ = hash_to_hex(sample_revision.id)
+
+    assert swh_storage.revision_missing([sample_revision.id]) == [sample_revision.id]
+
+    with pytest.raises(StorageArgumentException, match=f"should be {id_}"):
+        s = swh_storage.revision_add([attr.evolve(sample_revision, id=b"a" * 20)])
+
+    assert swh_storage.revision_missing([sample_revision.id]) == [sample_revision.id]
+
+    s = swh_storage.revision_add([sample_revision])
+    assert s == {
+        "revision:add": 1,
+    }
+
+    assert swh_storage.revision_missing([sample_revision.id]) == []
+
+
+def test_validating_proxy_storage_release(swh_storage, sample_data):
+    sample_release = sample_data.release
+    id_ = hash_to_hex(sample_release.id)
+
+    assert swh_storage.release_missing([sample_release.id]) == [sample_release.id]
+
+    with pytest.raises(StorageArgumentException, match=f"should be {id_}"):
+        s = swh_storage.release_add([attr.evolve(sample_release, id=b"a" * 20)])
+
+    assert swh_storage.release_missing([sample_release.id]) == [sample_release.id]
+
+    s = swh_storage.release_add([sample_release])
+    assert s == {
+        "release:add": 1,
+    }
+
+    assert swh_storage.release_missing([sample_release.id]) == []
+
+
+def test_validating_proxy_storage_snapshot(swh_storage, sample_data):
+    sample_snapshot = sample_data.snapshot
+    id_ = hash_to_hex(sample_snapshot.id)
+
+    assert swh_storage.snapshot_missing([sample_snapshot.id]) == [sample_snapshot.id]
+
+    with pytest.raises(StorageArgumentException, match=f"should be {id_}"):
+        s = swh_storage.snapshot_add([attr.evolve(sample_snapshot, id=b"a" * 20)])
+
+    assert swh_storage.snapshot_missing([sample_snapshot.id]) == [sample_snapshot.id]
+
+    s = swh_storage.snapshot_add([sample_snapshot])
+    assert s == {
+        "snapshot:add": 1,
+    }
+
+    assert swh_storage.snapshot_missing([sample_snapshot.id]) == []
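All of the negative tests above use the same corruption pattern, sketched here in isolation with a hypothetical helper (not part of the diff): `attr.evolve` copies a frozen model object with one field overridden, yielding a structurally valid object whose id no longer matches its payload. The proxy must reject it, and the `*_missing` assertions before and after the failed add confirm the backend was never touched.

    import attr

    def corrupt_id(model_obj):
        # Hypothetical helper mirroring the tests: keep every field except the
        # intrinsic id, which is forced to a bogus 20-byte value.
        return attr.evolve(model_obj, id=b"a" * 20)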
diff --git a/swh/storage/validate.py b/swh/storage/validate.py
new file mode 100644
index 00000000..eab18255
--- /dev/null
+++ b/swh/storage/validate.py
@@ -0,0 +1,78 @@
+# Copyright (C) 2020 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+
+from typing import Dict, Iterable, List
+
+from swh.model.hashutil import MultiHash, hash_to_bytes, hash_to_hex
+from swh.model.model import (
+    Content,
+    Directory,
+    Release,
+    Revision,
+    Snapshot,
+)
+
+from swh.storage import get_storage
+from swh.storage.exc import StorageArgumentException
+from swh.storage.interface import StorageInterface
+
+
+class ValidatingProxyStorage:
+    """Proxy for storage classes, which checks that inserted objects have a
+    correct hash.
+
+    Sample configuration use case for validating storage:
+
+    .. code-block:: yaml
+
+        storage:
+          cls: validate
+          storage:
+            cls: remote
+            url: http://storage.internal.staging.swh.network:5002/
+
+    """
+
+    def __init__(self, storage):
+        self.storage: StorageInterface = get_storage(**storage)
+
+    def __getattr__(self, key):
+        if key == "storage":
+            raise AttributeError(key)
+        return getattr(self.storage, key)
+
+    def _check_hashes(self, objects: Iterable):
+        for obj in objects:
+            id_ = hash_to_bytes(obj.__class__.compute_hash(obj.to_dict()))
+            if id_ != obj.id:
+                raise StorageArgumentException(
+                    f"Object has id {hash_to_hex(obj.id)}, "
+                    f"but it should be {hash_to_hex(id_)}: {obj}"
+                )
+
+    def content_add(self, content: List[Content]) -> Dict:
+        for cont in content:
+            hashes = MultiHash.from_data(cont.data).digest()
+            if hashes != cont.hashes():
+                raise StorageArgumentException(
+                    f"Object has hashes {cont.hashes()}, but they should be {hashes}"
+                )
+        return self.storage.content_add(content)
+
+    def directory_add(self, directories: List[Directory]) -> Dict:
+        self._check_hashes(directories)
+        return self.storage.directory_add(directories)
+
+    def revision_add(self, revisions: List[Revision]) -> Dict:
+        self._check_hashes(revisions)
+        return self.storage.revision_add(revisions)
+
+    def release_add(self, releases: List[Release]) -> Dict:
+        self._check_hashes(releases)
+        return self.storage.release_add(releases)
+
+    def snapshot_add(self, snapshots: List[Snapshot]) -> Dict:
+        self._check_hashes(snapshots)
+        return self.storage.snapshot_add(snapshots)
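A standalone sketch of the two checks the proxy performs, using only calls that appear above; the helper names are illustrative, not part of the diff. Note that `skipped_content_add` is deliberately not intercepted: a SkippedContent carries no data from which its hashes could be recomputed, which is exactly what the skipped-content test exercises.

    from swh.model.hashutil import MultiHash, hash_to_bytes

    def has_valid_id(obj) -> bool:
        # Directory/Revision/Release/Snapshot: recompute the intrinsic id from
        # the object's dict form, as _check_hashes does, and compare it with
        # the stored id.
        return hash_to_bytes(obj.__class__.compute_hash(obj.to_dict())) == obj.id

    def has_valid_content_hashes(cont) -> bool:
        # Content: recompute all hashes (sha1, sha1_git, sha256, blake2s256)
        # from the raw data and compare them with the declared ones.
        return MultiHash.from_data(cont.data).digest() == cont.hashes()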