diff --git a/swh/objstorage/multiplexer/filter/__init__.py b/swh/objstorage/multiplexer/filter/__init__.py index 36fcd1f..198a11d 100644 --- a/swh/objstorage/multiplexer/filter/__init__.py +++ b/swh/objstorage/multiplexer/filter/__init__.py @@ -1,93 +1,82 @@ -from .id_filter import PrefixIdObjStorageFilter, RegexIdObjStorageFilter from .read_write_filter import ReadObjStorageFilter _FILTERS_CLASSES = { "readonly": ReadObjStorageFilter, - "regex": RegexIdObjStorageFilter, - "prefix": PrefixIdObjStorageFilter, } -_FILTERS_PRIORITY = {"readonly": 0, "prefix": 1, "regex": 2} +_FILTERS_PRIORITY = {"readonly": 0} def read_only(): return {"type": "readonly"} -def id_prefix(prefix): - return {"type": "prefix", "prefix": prefix} - - -def id_regex(regex): - return {"type": "regex", "regex": regex} - - def _filter_priority(filter_type): """Get the priority of this filter. Priority is a value that indicates if the operation of the filter is time-consuming (smaller values means quick execution), or very likely to be almost always the same value (False being small, and True high). In case the filters are chained, they will be ordered in a way that small priorities (quick execution or instantly break the chain) are executed first. Default value is 1. Value 0 is recommended for storages that change behavior only by disabling some operations (making the method return None). """ return _FILTERS_PRIORITY.get(filter_type, 1) def add_filter(storage, filter_conf): """Add a filter to the given storage. Args: storage (swh.objstorage.ObjStorage): storage which will be filtered. filter_conf (dict): configuration of an ObjStorageFilter, given as a dictionary that contains the keys: - type: which represent the type of filter, one of the keys of _FILTERS_CLASSES - Every arguments that this type of filter requires. Returns: A filtered storage that perform only the valid operations. """ type = filter_conf["type"] args = {k: v for k, v in filter_conf.items() if k != "type"} filtered_storage = _FILTERS_CLASSES[type](storage=storage, **args) return filtered_storage def add_filters(storage, filter_confs): """Add multiple filters to the given storage. (See filter.add_filter) Args: storage (swh.objstorage.ObjStorage): storage which will be filtered. filter_confs (list): any number of filter conf, as a dict with: - type: which represent the type of filter, one of the keys of FILTERS. - Every arguments that this type of filter require. Returns: A filtered storage that fulfill the requirement of all the given filters. """ # Reverse sorting in order to put the filter with biggest priority first. filter_confs.sort(key=lambda conf: _filter_priority(conf["type"]), reverse=True) # Add the bigest filter to the storage, and reduce it to accumulate filters # on top of it, until the smallest (fastest, see filter.filter_priority) is # added. for filter_conf in filter_confs: storage = add_filter(storage, filter_conf) return storage diff --git a/swh/objstorage/multiplexer/filter/id_filter.py b/swh/objstorage/multiplexer/filter/id_filter.py deleted file mode 100644 index 5955a2e..0000000 --- a/swh/objstorage/multiplexer/filter/id_filter.py +++ /dev/null @@ -1,81 +0,0 @@ -# Copyright (C) 2015-2017 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -import abc -import re -from typing import Iterator - -from swh.model import hashutil -from swh.objstorage.exc import ObjNotFoundError -from swh.objstorage.interface import CompositeObjId -from swh.objstorage.multiplexer.filter.filter import ObjStorageFilter - - -class IdObjStorageFilter(ObjStorageFilter, metaclass=abc.ABCMeta): - """Filter that only allow operations if the object id match a requirement. - - Even for read operations, check before if the id match the requirements. - This may prevent for unnecessary disk access. - """ - - @abc.abstractmethod - def is_valid(self, obj_id): - """Indicates if the given id is valid.""" - raise NotImplementedError( - "Implementations of an IdObjStorageFilter " 'must have a "is_valid" method' - ) - - def __contains__(self, obj_id, *args, **kwargs): - if self.is_valid(obj_id): - return self.storage.__contains__(*args, obj_id=obj_id, **kwargs) - return False - - def __len__(self): - return sum(1 for i in [id for id in self.storage if self.is_valid(id)]) - - def __iter__(self) -> Iterator[CompositeObjId]: - yield from filter(lambda id: self.is_valid(id), iter(self.storage)) - - def add(self, content, obj_id, check_presence=True, *args, **kwargs): - if self.is_valid(obj_id): - return self.storage.add(content, *args, obj_id=obj_id, **kwargs) - - def restore(self, content, obj_id, *args, **kwargs): - if self.is_valid(obj_id): - return self.storage.restore(content, *args, obj_id=obj_id, **kwargs) - - def get(self, obj_id, *args, **kwargs): - if self.is_valid(obj_id): - return self.storage.get(*args, obj_id=obj_id, **kwargs) - raise ObjNotFoundError(obj_id) - - def check(self, obj_id, *args, **kwargs): - if self.is_valid(obj_id): - return self.storage.check(*args, obj_id=obj_id, **kwargs) - raise ObjNotFoundError(obj_id) - - -class RegexIdObjStorageFilter(IdObjStorageFilter): - """Filter that allow operations if the content's id as hex match a regex.""" - - def __init__(self, storage, regex): - super().__init__(storage) - self.regex = re.compile(regex) - - def is_valid(self, obj_id): - hex_obj_id = hashutil.hash_to_hex(obj_id) - return self.regex.match(hex_obj_id) is not None - - -class PrefixIdObjStorageFilter(IdObjStorageFilter): - """Filter that allow operations if the hexlified id have a given prefix.""" - - def __init__(self, storage, prefix): - super().__init__(storage) - self.prefix = str(prefix) - - def is_valid(self, obj_id): - hex_obj_id = hashutil.hash_to_hex(obj_id) - return str(hex_obj_id).startswith(self.prefix) diff --git a/swh/objstorage/tests/test_multiplexer_filter.py b/swh/objstorage/tests/test_multiplexer_filter.py deleted file mode 100644 index 215a140..0000000 --- a/swh/objstorage/tests/test_multiplexer_filter.py +++ /dev/null @@ -1,313 +0,0 @@ -# Copyright (C) 2015-2020 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -import random -import shutil -from string import ascii_lowercase -import tempfile -import unittest - -from swh.model import hashutil -from swh.objstorage.exc import Error, ObjNotFoundError -from swh.objstorage.factory import get_objstorage -from swh.objstorage.multiplexer.filter import id_prefix, id_regex, read_only -from swh.objstorage.objstorage import compute_hash - - -def get_random_content(): - return bytes("".join(random.sample(ascii_lowercase, 10)), "utf8") - - -class MixinTestReadFilter(unittest.TestCase): - # Read only filter should not allow writing - - def setUp(self): - super().setUp() - self.tmpdir = tempfile.mkdtemp() - pstorage = { - "cls": "pathslicing", - "root": self.tmpdir, - "slicing": "0:5", - } - base_storage = get_objstorage(**pstorage) - self.storage = get_objstorage( - "filtered", storage_conf=pstorage, filters_conf=[read_only()] - ) - self.valid_content = b"pre-existing content" - self.invalid_content = b"invalid_content" - self.true_invalid_content = b"Anything that is not correct" - self.absent_content = b"non-existent content" - # Create a valid content. - self.valid_id = compute_hash(self.valid_content) - base_storage.add(self.valid_content, obj_id=self.valid_id) - # Create an invalid id and add a content with it. - self.invalid_id = compute_hash(self.true_invalid_content) - base_storage.add(self.invalid_content, obj_id=self.invalid_id) - # Compute an id for a non-existing content. - self.absent_id = compute_hash(self.absent_content) - - def tearDown(self): - super().tearDown() - shutil.rmtree(self.tmpdir) - - def test_can_contains(self): - self.assertTrue(self.valid_id in self.storage) - self.assertTrue(self.invalid_id in self.storage) - self.assertFalse(self.absent_id in self.storage) - - def test_can_iter(self): - self.assertIn(self.valid_id, iter(self.storage)) - self.assertIn(self.invalid_id, iter(self.storage)) - - def test_can_len(self): - self.assertEqual(2, len(self.storage)) - - def test_can_get(self): - self.assertEqual(self.valid_content, self.storage.get(self.valid_id)) - self.assertEqual(self.invalid_content, self.storage.get(self.invalid_id)) - - def test_can_check(self): - with self.assertRaises(ObjNotFoundError): - self.storage.check(self.absent_id) - with self.assertRaises(Error): - self.storage.check(self.invalid_id) - self.storage.check(self.valid_id) - - def test_cannot_add(self): - new_id = self.storage.add(b"New content") - result = self.storage.add(self.valid_content, self.valid_id) - self.assertIsNone(new_id, self.storage) - self.assertIsNone(result) - - def test_cannot_restore(self): - result = self.storage.restore(self.valid_content, self.valid_id) - self.assertIsNone(result) - - -class MixinTestIdFilter: - """Mixin class that tests the filters based on filter.IdFilter - - Methods "make_valid", "make_invalid" and "filter_storage" must be - implemented by subclasses. - """ - - def setUp(self): - super().setUp() - # Use a hack here : as the mock uses the content as id, it is easy to - # create contents that are filtered or not. - self.prefix = "71" - self.tmpdir = tempfile.mkdtemp() - # Make the storage filtered - self.sconf = { - "cls": "pathslicing", - "root": self.tmpdir, - "slicing": "0:5", - } - storage = get_objstorage(**self.sconf) - self.base_storage = storage - self.storage = self.filter_storage(self.sconf) - - # Present content with valid id - self.present_valid_content = self.ensure_valid(b"yroqdtotji") - self.present_valid_id = compute_hash(self.present_valid_content) - - # Present content with invalid id - self.present_invalid_content = self.ensure_invalid(b"glxddlmmzb") - self.present_invalid_id = compute_hash(self.present_invalid_content) - - # Missing content with valid id - self.missing_valid_content = self.ensure_valid(b"rmzkdclkez") - self.missing_valid_id = compute_hash(self.missing_valid_content) - - # Missing content with invalid id - self.missing_invalid_content = self.ensure_invalid(b"hlejfuginh") - self.missing_invalid_id = compute_hash(self.missing_invalid_content) - - # Present corrupted content with valid id - self.present_corrupted_valid_content = self.ensure_valid(b"cdsjwnpaij") - self.true_present_corrupted_valid_content = self.ensure_valid(b"mgsdpawcrr") - self.present_corrupted_valid_id = compute_hash( - self.true_present_corrupted_valid_content - ) - - # Present corrupted content with invalid id - self.present_corrupted_invalid_content = self.ensure_invalid(b"pspjljnrco") - self.true_present_corrupted_invalid_content = self.ensure_invalid(b"rjocbnnbso") - self.present_corrupted_invalid_id = compute_hash( - self.true_present_corrupted_invalid_content - ) - - # Missing (potentially) corrupted content with valid id - self.missing_corrupted_valid_content = self.ensure_valid(b"zxkokfgtou") - self.true_missing_corrupted_valid_content = self.ensure_valid(b"royoncooqa") - self.missing_corrupted_valid_id = compute_hash( - self.true_missing_corrupted_valid_content - ) - - # Missing (potentially) corrupted content with invalid id - self.missing_corrupted_invalid_content = self.ensure_invalid(b"hxaxnrmnyk") - self.true_missing_corrupted_invalid_content = self.ensure_invalid(b"qhbolyuifr") - self.missing_corrupted_invalid_id = compute_hash( - self.true_missing_corrupted_invalid_content - ) - - # Add the content that are supposed to be present - self.storage.add(self.present_valid_content, obj_id=self.present_valid_id) - self.storage.add(self.present_invalid_content, obj_id=self.present_invalid_id) - self.storage.add( - self.present_corrupted_valid_content, obj_id=self.present_corrupted_valid_id - ) - self.storage.add( - self.present_corrupted_invalid_content, - obj_id=self.present_corrupted_invalid_id, - ) - - def tearDown(self): - super().tearDown() - shutil.rmtree(self.tmpdir) - - def filter_storage(self, sconf): - raise NotImplementedError( - "Id_filter test class must have a filter_storage method" - ) - - def ensure_valid(self, content=None): - if content is None: - content = get_random_content() - while not self.storage.is_valid(compute_hash(content)): - content = get_random_content() - return content - - def ensure_invalid(self, content=None): - if content is None: - content = get_random_content() - while self.storage.is_valid(compute_hash(content)): - content = get_random_content() - return content - - def test_contains(self): - # Both contents are present, but the invalid one should be ignored. - self.assertTrue(self.present_valid_id in self.storage) - self.assertFalse(self.present_invalid_id in self.storage) - self.assertFalse(self.missing_valid_id in self.storage) - self.assertFalse(self.missing_invalid_id in self.storage) - self.assertTrue(self.present_corrupted_valid_id in self.storage) - self.assertFalse(self.present_corrupted_invalid_id in self.storage) - self.assertFalse(self.missing_corrupted_valid_id in self.storage) - self.assertFalse(self.missing_corrupted_invalid_id in self.storage) - - def test_iter(self): - self.assertIn(self.present_valid_id, iter(self.storage)) - self.assertNotIn(self.present_invalid_id, iter(self.storage)) - self.assertNotIn(self.missing_valid_id, iter(self.storage)) - self.assertNotIn(self.missing_invalid_id, iter(self.storage)) - self.assertIn(self.present_corrupted_valid_id, iter(self.storage)) - self.assertNotIn(self.present_corrupted_invalid_id, iter(self.storage)) - self.assertNotIn(self.missing_corrupted_valid_id, iter(self.storage)) - self.assertNotIn(self.missing_corrupted_invalid_id, iter(self.storage)) - - def test_len(self): - # Four contents are present, but only two should be valid. - self.assertEqual(2, len(self.storage)) - - def test_get(self): - self.assertEqual( - self.present_valid_content, self.storage.get(self.present_valid_id) - ) - with self.assertRaises(ObjNotFoundError): - self.storage.get(self.present_invalid_id) - with self.assertRaises(ObjNotFoundError): - self.storage.get(self.missing_valid_id) - with self.assertRaises(ObjNotFoundError): - self.storage.get(self.missing_invalid_id) - self.assertEqual( - self.present_corrupted_valid_content, - self.storage.get(self.present_corrupted_valid_id), - ) - with self.assertRaises(ObjNotFoundError): - self.storage.get(self.present_corrupted_invalid_id) - with self.assertRaises(ObjNotFoundError): - self.storage.get(self.missing_corrupted_valid_id) - with self.assertRaises(ObjNotFoundError): - self.storage.get(self.missing_corrupted_invalid_id) - - def test_check(self): - self.storage.check(self.present_valid_id) - with self.assertRaises(ObjNotFoundError): - self.storage.check(self.present_invalid_id) - with self.assertRaises(ObjNotFoundError): - self.storage.check(self.missing_valid_id) - with self.assertRaises(ObjNotFoundError): - self.storage.check(self.missing_invalid_id) - with self.assertRaises(Error): - self.storage.check(self.present_corrupted_valid_id) - with self.assertRaises(ObjNotFoundError): - self.storage.check(self.present_corrupted_invalid_id) - with self.assertRaises(ObjNotFoundError): - self.storage.check(self.missing_corrupted_valid_id) - with self.assertRaises(ObjNotFoundError): - self.storage.check(self.missing_corrupted_invalid_id) - - def test_add(self): - # Add valid and invalid contents to the storage and check their - # presence with the unfiltered storage. - valid_content = self.ensure_valid(b"ulepsrjbgt") - valid_id = compute_hash(valid_content) - invalid_content = self.ensure_invalid(b"znvghkjked") - invalid_id = compute_hash(invalid_content) - self.storage.add(valid_content, obj_id=valid_id) - self.storage.add(invalid_content, obj_id=invalid_id) - self.assertTrue(valid_id in self.base_storage) - self.assertFalse(invalid_id in self.base_storage) - - def test_restore(self): - # Add corrupted content to the storage and the try to restore it - valid_content = self.ensure_valid(b"ulepsrjbgt") - valid_id = compute_hash(valid_content) - corrupted_content = self.ensure_valid(b"ltjkjsloyb") - corrupted_id = compute_hash(corrupted_content) - self.storage.add(corrupted_content, obj_id=valid_id) - with self.assertRaises(ObjNotFoundError): - self.storage.check(corrupted_id) - with self.assertRaises(Error): - self.storage.check(valid_id) - self.storage.restore(valid_content, obj_id=valid_id) - self.storage.check(valid_id) - - -class TestPrefixFilter(MixinTestIdFilter, unittest.TestCase): - def setUp(self): - self.prefix = b"71" - super().setUp() - - def ensure_valid(self, content): - obj_id = compute_hash(content) - hex_obj_id = hashutil.hash_to_hex(obj_id) - self.assertTrue(hex_obj_id.startswith(self.prefix)) - return content - - def ensure_invalid(self, content): - obj_id = compute_hash(content) - hex_obj_id = hashutil.hash_to_hex(obj_id) - self.assertFalse(hex_obj_id.startswith(self.prefix)) - return content - - def filter_storage(self, sconf): - return get_objstorage( - "filtered", - storage_conf=sconf, - filters_conf=[id_prefix(self.prefix)], - ) - - -class TestRegexFilter(MixinTestIdFilter, unittest.TestCase): - def setUp(self): - self.regex = r"[a-f][0-9].*" - super().setUp() - - def filter_storage(self, sconf): - return get_objstorage( - "filtered", storage_conf=sconf, filters_conf=[id_regex(self.regex)] - ) diff --git a/swh/objstorage/tests/test_readonly_filter.py b/swh/objstorage/tests/test_readonly_filter.py new file mode 100644 index 0000000..39ef57c --- /dev/null +++ b/swh/objstorage/tests/test_readonly_filter.py @@ -0,0 +1,85 @@ +# Copyright (C) 2015-2020 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import random +import shutil +from string import ascii_lowercase +import tempfile +import unittest + +from swh.objstorage.exc import Error, ObjNotFoundError +from swh.objstorage.factory import get_objstorage +from swh.objstorage.multiplexer.filter import read_only +from swh.objstorage.objstorage import compute_hash + + +def get_random_content(): + return bytes("".join(random.sample(ascii_lowercase, 10)), "utf8") + + +class ReadOnlyFilterTestCase(unittest.TestCase): + # Read only filter should not allow writing + + def setUp(self): + super().setUp() + self.tmpdir = tempfile.mkdtemp() + pstorage = { + "cls": "pathslicing", + "root": self.tmpdir, + "slicing": "0:5", + } + base_storage = get_objstorage(**pstorage) + self.storage = get_objstorage( + "filtered", storage_conf=pstorage, filters_conf=[read_only()] + ) + self.valid_content = b"pre-existing content" + self.invalid_content = b"invalid_content" + self.true_invalid_content = b"Anything that is not correct" + self.absent_content = b"non-existent content" + # Create a valid content. + self.valid_id = compute_hash(self.valid_content) + base_storage.add(self.valid_content, obj_id=self.valid_id) + # Create an invalid id and add a content with it. + self.invalid_id = compute_hash(self.true_invalid_content) + base_storage.add(self.invalid_content, obj_id=self.invalid_id) + # Compute an id for a non-existing content. + self.absent_id = compute_hash(self.absent_content) + + def tearDown(self): + super().tearDown() + shutil.rmtree(self.tmpdir) + + def test_can_contains(self): + self.assertTrue(self.valid_id in self.storage) + self.assertTrue(self.invalid_id in self.storage) + self.assertFalse(self.absent_id in self.storage) + + def test_can_iter(self): + self.assertIn(self.valid_id, iter(self.storage)) + self.assertIn(self.invalid_id, iter(self.storage)) + + def test_can_len(self): + self.assertEqual(2, len(self.storage)) + + def test_can_get(self): + self.assertEqual(self.valid_content, self.storage.get(self.valid_id)) + self.assertEqual(self.invalid_content, self.storage.get(self.invalid_id)) + + def test_can_check(self): + with self.assertRaises(ObjNotFoundError): + self.storage.check(self.absent_id) + with self.assertRaises(Error): + self.storage.check(self.invalid_id) + self.storage.check(self.valid_id) + + def test_cannot_add(self): + new_id = self.storage.add(b"New content") + result = self.storage.add(self.valid_content, self.valid_id) + self.assertIsNone(new_id, self.storage) + self.assertIsNone(result) + + def test_cannot_restore(self): + result = self.storage.restore(self.valid_content, self.valid_id) + self.assertIsNone(result)